devadigaprathamesh committed
Commit 6f257b9 · verified · 1 Parent(s): ed45092

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,5 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ adapter_model.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
  *.bin filter=lfs diff=lfs merge=lfs -text
  *.pt filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,144 @@
+ # JurisQwen: Legal Domain Fine-tuned Qwen2.5-7B Model
+
+ ## Overview
+ JurisQwen is a specialized legal-domain language model based on Qwen2.5-7B and fine-tuned on an Indian legal instruction dataset. It is designed to assist with legal queries, analyze documents, and provide information about Indian law.
+
+ ## Model Details
+
+ ### Model Description
+ - **Developed by:** Prathamesh Devadiga
+ - **Base Model:** Qwen2.5-7B by Qwen
+ - **Model Type:** Causal language model with LoRA fine-tuning
+ - **Language:** English, with a focus on Indian legal terminology
+ - **License:** [Specify License - inherited from Qwen2.5 or your custom license]
+ - **Finetuned from model:** Qwen/Qwen2.5-7B
+ - **Framework:** PEFT 0.15.1 with Unsloth optimization
+
+ ### Training Dataset
+ The model was fine-tuned on "viber1/indian-law-dataset", which contains instruction-response pairs focused on Indian legal knowledge and terminology.
+
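+ Each record pairs an `Instruction` with a `Response`; a quick way to inspect the data is a minimal sketch with the `datasets` library, mirroring how `finetuning.py` loads it:
+
+ ```python
+ from datasets import load_dataset
+
+ # Field names below match what finetuning.py expects.
+ ds = load_dataset("viber1/indian-law-dataset")
+ example = ds["train"][0]
+ print(example["Instruction"])
+ print(example["Response"])
+ ```
+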
+ ## Technical Specifications
+
+ ### Model Architecture
+ - Base model: Qwen2.5-7B
+ - Fine-tuning method: LoRA (Low-Rank Adaptation)
+ - LoRA configuration (the equivalent `peft` config is sketched below):
+   - Rank (r): 32
+   - Alpha: 64
+   - Dropout: 0.05
+   - Target modules: q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj
+
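+ For reference, these hyperparameters correspond to a `peft` `LoraConfig` roughly like the sketch below; the authoritative values shipped with this adapter are in `adapter_config.json`.
+
+ ```python
+ from peft import LoraConfig
+
+ # Sketch of the LoRA setup described above (scaling factor alpha / r = 2).
+ lora_config = LoraConfig(
+     r=32,
+     lora_alpha=64,
+     lora_dropout=0.05,
+     bias="none",
+     task_type="CAUSAL_LM",
+     target_modules=[
+         "q_proj", "k_proj", "v_proj", "o_proj",
+         "gate_proj", "up_proj", "down_proj",
+     ],
+ )
+ ```
+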
+ ### Training Procedure
+ - **Training Infrastructure:** NVIDIA A100-40GB GPU
+ - **Quantization:** 4-bit quantization using bitsandbytes
+ - **Mixed Precision:** bfloat16
+ - **Attention Implementation:** Flash Attention 2
+ - **Training Hyperparameters:**
+   - Epochs: 3
+   - Batch size: 16 per device
+   - Gradient accumulation steps: 2 (effective batch size 32)
+   - Learning rate: 2e-4
+   - Weight decay: 0.001
+   - Scheduler: Cosine with 10% warmup
+   - Optimizer: AdamW 8-bit
+   - Maximum sequence length: 4096
+   - TF32 enabled for A100
+
+ ### Deployment Infrastructure
+ - Deployed using the Modal cloud platform
+ - GPU: NVIDIA A100-40GB
+ - Persistent volume storage for model checkpoints
+
+ ## Usage
+
+ ### Setting Up the Environment
+ This model is deployed using Modal. To use it, you'll need to:
+
+ 1. Install Modal:
+ ```bash
+ pip install modal
+ ```
+
+ 2. Authenticate with Modal:
+ ```bash
+ modal token new
+ ```
+
+ 3. Deploy the application:
+ ```bash
+ python finetuning.py
+ ```
+
+ ### Running Fine-tuning
+ To run the fine-tuning process:
+
+ ```python
+ from finetuning import app, finetune_qwen
+
+ # Deploy the app
+ app.deploy()
+
+ # Run fine-tuning
+ result = finetune_qwen.remote()
+ print(f"Fine-tuning result: {result}")
+ ```
+
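+ Alternatively, the Modal CLI can drive the same entrypoint; a minimal sketch, assuming Modal is installed and authenticated:
+
+ ```bash
+ # Invoke the local_entrypoint defined in finetuning.py (runs fine-tuning on a remote A100)
+ modal run finetuning.py
+
+ # Or deploy the app so finetune_qwen / test_inference can be called later
+ modal deploy finetuning.py
+ ```
+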
+ ### Inference
+ To run inference with the fine-tuned model:
+
+ ```python
+ from finetuning import app, test_inference
+
+ # Example legal query
+ response = test_inference.remote("What are the key provisions of the Indian Contract Act?")
+ print(response)
+ ```
+
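+ If you prefer to run the adapter outside Modal, a minimal sketch using `transformers` and `peft` is shown below. It assumes a CUDA GPU with enough memory for the 7B base model, and it assumes this repository's Hub id is `devadigaprathamesh/JurisQwen`:
+
+ ```python
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ BASE_ID = "Qwen/Qwen2.5-7B"
+ ADAPTER_ID = "devadigaprathamesh/JurisQwen"  # assumed repo id for this adapter
+
+ tokenizer = AutoTokenizer.from_pretrained(ADAPTER_ID)
+ base = AutoModelForCausalLM.from_pretrained(
+     BASE_ID, torch_dtype=torch.bfloat16, device_map="auto"
+ )
+ model = PeftModel.from_pretrained(base, ADAPTER_ID)
+
+ # Same prompt format as the test prompt in finetuning.py
+ prompt = "<|im_start|>user\nWhat are the key provisions of the Indian Contract Act?<|im_end|>"
+ inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+ outputs = model.generate(**inputs, max_new_tokens=512)
+ print(tokenizer.decode(outputs[0]))
+ ```
+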
+ ## Input Format
+ The model uses the following format for prompts:
+ ```
+ <|im_start|>user
+ [Your legal question here]
+ <|im_end|>
+ ```
+
+ The model will respond with:
+ ```
+ <|im_start|>assistant
+ [Legal response]
+ <|im_end|>
+ ```
+
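+ This is the same template applied to every training example; a sketch equivalent to the `format_instruction` helper in `finetuning.py`:
+
+ ```python
+ def format_example(instruction: str, response: str) -> str:
+     # Mirrors the chat-style template used during fine-tuning.
+     return (
+         f"<|im_start|>user\n{instruction}<|im_end|>\n"
+         f"<|im_start|>assistant\n{response}<|im_end|>"
+     )
+ ```
+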
+ ## Limitations and Biases
+ - The model is trained specifically on Indian legal data and may not generalize well to other legal systems
+ - Output from the model should not be treated as professional legal counsel
+ - The model may exhibit biases present in the training data
+ - Performance on complex or novel legal scenarios not covered by the training data may be limited
+
+ ## Recommendations
+ - Validate important legal information with qualified legal professionals
+ - Always cross-reference model outputs with authoritative legal sources
+ - Be aware that legal interpretations vary; the model provides one possible interpretation
+
+ ## Environmental Impact
+ - Hardware: NVIDIA A100-40GB GPU
+ - Training time: approximately 3-5 hours
+ - Cloud provider: Modal
+
+ ## Citation
+ If you use this model in your research, please cite:
+
+ ```
+ @software{JurisQwen,
+   author = {Prathamesh Devadiga},
+   title  = {JurisQwen: Indian Legal Domain Fine-tuned Qwen2.5-7B Model},
+   year   = {2025},
+   url    = {https://github.com/devadigapratham/JurisQwen}
+ }
+ ```
+
+ ## Acknowledgments
+ - Qwen team for the original Qwen2.5-7B model
+ - Unsloth for optimization tools
+ - Modal for deployment infrastructure
+ - The creator of the "viber1/indian-law-dataset"
adapter_config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "unsloth/qwen2.5-7b-unsloth-bnb-4bit",
+   "bias": "none",
+   "corda_config": null,
+   "eva_config": null,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 64,
+   "lora_bias": false,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 32,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "o_proj",
+     "v_proj",
+     "gate_proj",
+     "k_proj",
+     "up_proj",
+     "down_proj"
+   ],
+   "task_type": "CAUSAL_LM",
+   "trainable_token_indices": null,
+   "use_dora": false,
+   "use_rslora": false
+ }
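
A quick way to confirm these adapter settings after download is to read the config file directly; a minimal sketch, again assuming the repository id is `devadigaprathamesh/JurisQwen`:

```python
import json
from huggingface_hub import hf_hub_download

# The repo id below is an assumption; substitute the actual Hub id of this adapter.
path = hf_hub_download("devadigaprathamesh/JurisQwen", "adapter_config.json")
with open(path) as f:
    cfg = json.load(f)
print(cfg["r"], cfg["lora_alpha"], cfg["target_modules"])
```
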
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6b6b40b7f5a2311bb1a00e2d2665129ed926d4b87fd04578ec504452d5d5b84
+ size 134
added_tokens.json ADDED
@@ -0,0 +1,24 @@
+ {
+   "</tool_call>": 151658,
+   "<tool_call>": 151657,
+   "<|box_end|>": 151649,
+   "<|box_start|>": 151648,
+   "<|endoftext|>": 151643,
+   "<|file_sep|>": 151664,
+   "<|fim_middle|>": 151660,
+   "<|fim_pad|>": 151662,
+   "<|fim_prefix|>": 151659,
+   "<|fim_suffix|>": 151661,
+   "<|im_end|>": 151645,
+   "<|im_start|>": 151644,
+   "<|image_pad|>": 151655,
+   "<|object_ref_end|>": 151647,
+   "<|object_ref_start|>": 151646,
+   "<|quad_end|>": 151651,
+   "<|quad_start|>": 151650,
+   "<|repo_name|>": 151663,
+   "<|video_pad|>": 151656,
+   "<|vision_end|>": 151653,
+   "<|vision_pad|>": 151654,
+   "<|vision_start|>": 151652
+ }
finetuning.py ADDED
@@ -0,0 +1,191 @@
+ import modal
+ import os
+ from pathlib import Path
+
+ # Define Modal app
+ app = modal.App("qwen-law-finetuning")
+
+ # Create a custom image with all dependencies.
+ # pip installs are broken into steps to make the build more reliable.
+ # Use Modal's CUDA image, which has the CUDA environment pre-configured.
+ image = (
+     modal.Image.from_registry(
+         "nvidia/cuda:12.1.0-devel-ubuntu22.04",
+         add_python="3.10"
+     )
+     .apt_install(["git", "build-essential", "ninja-build"])
+     .pip_install("unsloth", "datasets")  # fine-tuning stack and dataset loading
+     .pip_install("torch>=2.0.1", "transformers>=4.33.0")  # core deep-learning stack
+     .pip_install("peft>=0.5.0", "trl>=0.7.1", "tensorboard")  # LoRA, SFT trainer, logging
+     .pip_install("bitsandbytes>=0.41.1", "accelerate>=0.23.0")  # 4-bit quantization and device placement
+     .pip_install("xformers>=0.0.21", "einops", "sentencepiece", "protobuf")  # attention and tokenizer dependencies
+     .pip_install("flash-attn>=2.3.0")  # Flash Attention 2 kernels
+     .add_local_dir(".", remote_path="/root/code")  # ship the local project directory into the image
+ )
+
+ # Define volume to persist model checkpoints
+ volume = modal.Volume.from_name("finetune-volume", create_if_missing=True)
+ VOLUME_PATH = "/data"
+
+ @app.function(
+     image=image,
+     gpu="A100-40GB",
+     timeout=60 * 60 * 5,  # 5 hour timeout
+     volumes={VOLUME_PATH: volume},
+ )
+ def finetune_qwen():
+     import torch
+     from datasets import load_dataset
+     from unsloth import FastLanguageModel
+     from transformers import TrainingArguments
+     from trl import SFTTrainer
+     import os
+
+     # Set working directory
+     os.chdir("/root/code")
+
+     # Create output directory in the volume
+     output_dir = os.path.join(VOLUME_PATH, "JurisQwen")
+     os.makedirs(output_dir, exist_ok=True)
+
+     print("Loading dataset...")
+     # Load the dataset
+     ds = load_dataset("viber1/indian-law-dataset")
+
+     # Format the dataset for instruction fine-tuning
+     def format_instruction(example):
+         return {
+             "text": f"<|im_start|>user\n{example['Instruction']}<|im_end|>\n<|im_start|>assistant\n{example['Response']}<|im_end|>"
+         }
+
+     # Apply formatting
+     formatted_ds = ds.map(format_instruction)
+     train_dataset = formatted_ds["train"]
+
+     # A100-optimized parameters
+     max_seq_length = 4096  # Increased for A100's larger memory
+     model_id = "Qwen/Qwen2.5-7B"
+
+     print("Loading model...")
+     # Initialize model with Unsloth, optimized for A100
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_id,
+         max_seq_length=max_seq_length,
+         load_in_4bit=True,  # Quantized training for memory efficiency
+         attn_implementation="flash_attention_2",  # Flash Attention 2 for A100
+         dtype=torch.bfloat16,  # Explicitly use bfloat16 for A100
+     )
+
+     # Prepare model for training with optimized parameters for A100
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=32,  # Increased LoRA rank for A100
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                         "gate_proj", "up_proj", "down_proj"],
+         lora_alpha=64,  # Increased alpha for better training
+         lora_dropout=0.05,
+         bias="none",
+         use_gradient_checkpointing="unsloth",  # Enables efficient training on long sequences
+     )
+
+     # Set training arguments optimized for A100
+     training_args = TrainingArguments(
+         output_dir=os.path.join(VOLUME_PATH, "checkpoints"),
+         num_train_epochs=3,
+         per_device_train_batch_size=16,  # Increased for A100
+         gradient_accumulation_steps=2,  # Reduced due to larger batch size
+         optim="adamw_8bit",  # 8-bit Adam optimizer for efficiency
+         learning_rate=2e-4,
+         weight_decay=0.001,
+         lr_scheduler_type="cosine",
+         warmup_ratio=0.1,
+         bf16=True,  # Enable bf16 (A100 supports it)
+         fp16=False,  # Disable fp16 when using bf16
+         logging_steps=10,
+         save_strategy="epoch",
+         report_to="tensorboard",
+         tf32=True,  # Enable TF32 for A100
+     )
+
+     print("Preparing trainer...")
+     # Using SFTTrainer for better performance
+     trainer = SFTTrainer(
+         model=model,
+         tokenizer=tokenizer,
+         train_dataset=train_dataset,
+         dataset_text_field="text",
+         max_seq_length=max_seq_length,
+         args=training_args,
+         packing=True,  # Enable packing for faster training
+     )
+
+     # Train the model
+     print("Starting training...")
+     trainer.train()
+     print("Training completed!")
+
+     # Save the fine-tuned model
+     print(f"Saving model to {output_dir}")
+     model.save_pretrained(output_dir)
+     tokenizer.save_pretrained(output_dir)
+
+     # Test inference with the fine-tuned model
+     print("Testing inference...")
+     FastLanguageModel.for_inference(model)  # Enable faster inference
+     test_prompt = "<|im_start|>user\nWhat are the key provisions of the Indian Contract Act?<|im_end|>"
+     inputs = tokenizer([test_prompt], return_tensors="pt").to("cuda")
+     outputs = model.generate(**inputs, max_new_tokens=512)
+     print("Generated response:")
+     print(tokenizer.decode(outputs[0]))
+
+     return f"Model successfully trained and saved to {output_dir}"
+
+ @app.function(
+     image=image,
+     gpu="A100-40GB",
+     timeout=60 * 10,  # 10 minute timeout
+     volumes={VOLUME_PATH: volume},
+ )
+ def test_inference(prompt: str):
+     from unsloth import FastLanguageModel
+     import torch
+     import os
+
+     # Load the fine-tuned model
+     model_path = os.path.join(VOLUME_PATH, "JurisQwen")
+
+     print(f"Loading model from {model_path}")
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_path,
+         max_seq_length=4096,
+         attn_implementation="flash_attention_2",
+         dtype=torch.bfloat16,
+     )
+
+     # Enable fast inference
+     FastLanguageModel.for_inference(model)
+
+     # Format the prompt
+     formatted_prompt = f"<|im_start|>user\n{prompt}<|im_end|>"
+     inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
+
+     # Generate response
+     outputs = model.generate(**inputs, max_new_tokens=512)
+     response = tokenizer.decode(outputs[0])
+
+     return response
+
+ # Local entrypoint: deploys the app and kicks off fine-tuning remotely
+ @app.local_entrypoint()
+ def main():
+     print("Starting fine-tuning process...")
+     app.deploy()
+     result = finetune_qwen.remote()
+     print(f"Fine-tuning result: {result}")
+
+
+ if __name__ == "__main__":
+     main()
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "eos_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<|vision_pad|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
+ {
+   "add_bos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "151643": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151644": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151645": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151646": {
+       "content": "<|object_ref_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151647": {
+       "content": "<|object_ref_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151648": {
+       "content": "<|box_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151649": {
+       "content": "<|box_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151650": {
+       "content": "<|quad_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151651": {
+       "content": "<|quad_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151652": {
+       "content": "<|vision_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151653": {
+       "content": "<|vision_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151654": {
+       "content": "<|vision_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151655": {
+       "content": "<|image_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151656": {
+       "content": "<|video_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "151657": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151658": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151659": {
+       "content": "<|fim_prefix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151660": {
+       "content": "<|fim_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151661": {
+       "content": "<|fim_suffix|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151662": {
+       "content": "<|fim_pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151663": {
+       "content": "<|repo_name|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "151664": {
+       "content": "<|file_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>",
+     "<|object_ref_start|>",
+     "<|object_ref_end|>",
+     "<|box_start|>",
+     "<|box_end|>",
+     "<|quad_start|>",
+     "<|quad_end|>",
+     "<|vision_start|>",
+     "<|vision_end|>",
+     "<|vision_pad|>",
+     "<|image_pad|>",
+     "<|video_pad|>"
+   ],
+   "bos_token": null,
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|endoftext|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "model_max_length": 131072,
+   "pad_token": "<|vision_pad|>",
+   "padding_side": "right",
+   "split_special_tokens": false,
+   "tokenizer_class": "Qwen2Tokenizer",
+   "unk_token": null
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff