jgayed commited on
Commit
e496670
·
verified ·
1 Parent(s): 67eb872

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. README.md +58 -0
  3. adapter_config.json +461 -0
  4. adapter_model.safetensors +3 -0
  5. added_tokens.json +3 -0
  6. all_results.json +9 -0
  7. chat_template.json +3 -0
  8. checkpoint-100/README.md +202 -0
  9. checkpoint-100/adapter_config.json +461 -0
  10. checkpoint-100/adapter_model.safetensors +3 -0
  11. checkpoint-100/added_tokens.json +3 -0
  12. checkpoint-100/chat_template.json +3 -0
  13. checkpoint-100/optimizer.pt +3 -0
  14. checkpoint-100/preprocessor_config.json +29 -0
  15. checkpoint-100/processor_config.json +4 -0
  16. checkpoint-100/rng_state.pth +3 -0
  17. checkpoint-100/scheduler.pt +3 -0
  18. checkpoint-100/special_tokens_map.json +42 -0
  19. checkpoint-100/tokenizer.json +3 -0
  20. checkpoint-100/tokenizer.model +3 -0
  21. checkpoint-100/tokenizer_config.json +0 -0
  22. checkpoint-100/trainer_state.json +194 -0
  23. checkpoint-100/training_args.bin +3 -0
  24. checkpoint-120/README.md +202 -0
  25. checkpoint-120/adapter_config.json +461 -0
  26. checkpoint-120/adapter_model.safetensors +3 -0
  27. checkpoint-120/added_tokens.json +3 -0
  28. checkpoint-120/chat_template.json +3 -0
  29. checkpoint-120/optimizer.pt +3 -0
  30. checkpoint-120/preprocessor_config.json +29 -0
  31. checkpoint-120/processor_config.json +4 -0
  32. checkpoint-120/rng_state.pth +3 -0
  33. checkpoint-120/scheduler.pt +3 -0
  34. checkpoint-120/special_tokens_map.json +42 -0
  35. checkpoint-120/tokenizer.json +3 -0
  36. checkpoint-120/tokenizer.model +3 -0
  37. checkpoint-120/tokenizer_config.json +0 -0
  38. checkpoint-120/trainer_state.json +226 -0
  39. checkpoint-120/training_args.bin +3 -0
  40. llamaboard_config.yaml +78 -0
  41. preprocessor_config.json +29 -0
  42. processor_config.json +4 -0
  43. running_log.txt +0 -0
  44. special_tokens_map.json +42 -0
  45. tokenizer.json +3 -0
  46. tokenizer.model +3 -0
  47. tokenizer_config.json +0 -0
  48. train_results.json +9 -0
  49. trainer_log.jsonl +25 -0
  50. trainer_state.json +236 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoint-100/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoint-120/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ license: other
4
+ base_model: google/gemma-3-27b-it
5
+ tags:
6
+ - llama-factory
7
+ - lora
8
+ - generated_from_trainer
9
+ model-index:
10
+ - name: gemma5
11
+ results: []
12
+ ---
13
+
14
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
15
+ should probably proofread and complete it, then remove this comment. -->
16
+
17
+ # gemma5
18
+
19
+ This model is a fine-tuned version of [google/gemma-3-27b-it](https://huggingface.co/google/gemma-3-27b-it) on the ets120 dataset.
20
+
21
+ ## Model description
22
+
23
+ More information needed
24
+
25
+ ## Intended uses & limitations
26
+
27
+ More information needed
28
+
29
+ ## Training and evaluation data
30
+
31
+ More information needed
32
+
33
+ ## Training procedure
34
+
35
+ ### Training hyperparameters
36
+
37
+ The following hyperparameters were used during training:
38
+ - learning_rate: 5e-05
39
+ - train_batch_size: 1
40
+ - eval_batch_size: 8
41
+ - seed: 42
42
+ - gradient_accumulation_steps: 8
43
+ - total_train_batch_size: 8
44
+ - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
45
+ - lr_scheduler_type: cosine
46
+ - num_epochs: 8.0
47
+
48
+ ### Training results
49
+
50
+
51
+
52
+ ### Framework versions
53
+
54
+ - PEFT 0.12.0
55
+ - Transformers 4.50.0
56
+ - Pytorch 2.6.0+cu124
57
+ - Datasets 3.3.2
58
+ - Tokenizers 0.21.0
adapter_config.json ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "google/gemma-3-27b-it",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "language_model.model.layers.18.mlp.up_proj",
24
+ "language_model.model.layers.14.self_attn.q_proj",
25
+ "language_model.model.layers.23.mlp.down_proj",
26
+ "language_model.model.layers.5.self_attn.q_proj",
27
+ "language_model.model.layers.48.self_attn.k_proj",
28
+ "language_model.model.layers.29.self_attn.v_proj",
29
+ "language_model.model.layers.39.self_attn.v_proj",
30
+ "language_model.model.layers.5.self_attn.v_proj",
31
+ "language_model.model.layers.60.mlp.gate_proj",
32
+ "language_model.model.layers.17.self_attn.o_proj",
33
+ "language_model.model.layers.60.mlp.down_proj",
34
+ "language_model.model.layers.33.mlp.up_proj",
35
+ "language_model.model.layers.53.self_attn.q_proj",
36
+ "language_model.model.layers.27.mlp.down_proj",
37
+ "language_model.model.layers.57.self_attn.v_proj",
38
+ "language_model.model.layers.26.mlp.up_proj",
39
+ "language_model.model.layers.10.mlp.gate_proj",
40
+ "language_model.model.layers.23.mlp.up_proj",
41
+ "language_model.model.layers.57.self_attn.q_proj",
42
+ "language_model.model.layers.57.mlp.up_proj",
43
+ "language_model.model.layers.39.mlp.gate_proj",
44
+ "language_model.model.layers.26.mlp.down_proj",
45
+ "language_model.model.layers.11.mlp.up_proj",
46
+ "language_model.model.layers.25.mlp.up_proj",
47
+ "language_model.model.layers.6.mlp.gate_proj",
48
+ "language_model.model.layers.36.mlp.gate_proj",
49
+ "language_model.model.layers.2.self_attn.o_proj",
50
+ "language_model.model.layers.30.self_attn.o_proj",
51
+ "language_model.model.layers.4.mlp.gate_proj",
52
+ "language_model.model.layers.13.mlp.up_proj",
53
+ "language_model.model.layers.52.mlp.down_proj",
54
+ "language_model.model.layers.4.mlp.up_proj",
55
+ "language_model.model.layers.32.self_attn.v_proj",
56
+ "language_model.model.layers.1.self_attn.o_proj",
57
+ "language_model.model.layers.22.mlp.down_proj",
58
+ "language_model.model.layers.45.mlp.down_proj",
59
+ "language_model.model.layers.46.self_attn.o_proj",
60
+ "language_model.model.layers.38.self_attn.o_proj",
61
+ "language_model.model.layers.23.self_attn.q_proj",
62
+ "language_model.model.layers.46.mlp.up_proj",
63
+ "language_model.model.layers.49.mlp.down_proj",
64
+ "language_model.model.layers.40.self_attn.k_proj",
65
+ "language_model.model.layers.24.self_attn.q_proj",
66
+ "language_model.model.layers.9.self_attn.o_proj",
67
+ "language_model.model.layers.37.mlp.gate_proj",
68
+ "language_model.model.layers.36.self_attn.k_proj",
69
+ "language_model.model.layers.49.self_attn.k_proj",
70
+ "language_model.model.layers.24.self_attn.v_proj",
71
+ "language_model.model.layers.1.mlp.down_proj",
72
+ "language_model.model.layers.10.mlp.down_proj",
73
+ "language_model.model.layers.61.mlp.gate_proj",
74
+ "language_model.model.layers.34.self_attn.o_proj",
75
+ "language_model.model.layers.9.mlp.up_proj",
76
+ "language_model.model.layers.59.self_attn.v_proj",
77
+ "language_model.model.layers.30.self_attn.q_proj",
78
+ "language_model.model.layers.44.mlp.gate_proj",
79
+ "language_model.model.layers.46.mlp.down_proj",
80
+ "language_model.model.layers.46.self_attn.k_proj",
81
+ "language_model.model.layers.28.self_attn.v_proj",
82
+ "language_model.model.layers.58.self_attn.o_proj",
83
+ "language_model.model.layers.13.self_attn.o_proj",
84
+ "language_model.model.layers.24.mlp.up_proj",
85
+ "language_model.model.layers.0.self_attn.q_proj",
86
+ "language_model.model.layers.58.mlp.up_proj",
87
+ "language_model.model.layers.12.mlp.down_proj",
88
+ "language_model.model.layers.43.mlp.up_proj",
89
+ "language_model.model.layers.44.self_attn.k_proj",
90
+ "language_model.model.layers.17.self_attn.q_proj",
91
+ "language_model.model.layers.59.mlp.up_proj",
92
+ "language_model.model.layers.50.mlp.down_proj",
93
+ "language_model.model.layers.24.self_attn.k_proj",
94
+ "language_model.model.layers.59.self_attn.k_proj",
95
+ "language_model.model.layers.1.mlp.up_proj",
96
+ "language_model.model.layers.58.self_attn.q_proj",
97
+ "language_model.model.layers.51.mlp.gate_proj",
98
+ "language_model.model.layers.4.self_attn.v_proj",
99
+ "language_model.model.layers.45.mlp.gate_proj",
100
+ "language_model.model.layers.8.mlp.gate_proj",
101
+ "language_model.model.layers.43.self_attn.v_proj",
102
+ "language_model.model.layers.51.self_attn.v_proj",
103
+ "language_model.model.layers.37.self_attn.v_proj",
104
+ "language_model.model.layers.51.self_attn.q_proj",
105
+ "language_model.model.layers.51.self_attn.o_proj",
106
+ "language_model.model.layers.33.mlp.down_proj",
107
+ "language_model.model.layers.44.self_attn.v_proj",
108
+ "language_model.model.layers.18.self_attn.o_proj",
109
+ "language_model.model.layers.14.self_attn.k_proj",
110
+ "language_model.model.layers.55.self_attn.k_proj",
111
+ "language_model.model.layers.10.self_attn.q_proj",
112
+ "language_model.model.layers.34.mlp.up_proj",
113
+ "language_model.model.layers.27.self_attn.v_proj",
114
+ "language_model.model.layers.20.self_attn.o_proj",
115
+ "language_model.model.layers.33.self_attn.k_proj",
116
+ "language_model.model.layers.44.mlp.up_proj",
117
+ "language_model.model.layers.3.self_attn.o_proj",
118
+ "language_model.model.layers.15.mlp.up_proj",
119
+ "language_model.model.layers.17.mlp.gate_proj",
120
+ "language_model.model.layers.17.mlp.up_proj",
121
+ "language_model.model.layers.55.mlp.up_proj",
122
+ "language_model.model.layers.0.self_attn.v_proj",
123
+ "language_model.model.layers.56.self_attn.v_proj",
124
+ "language_model.model.layers.39.self_attn.k_proj",
125
+ "language_model.model.layers.31.self_attn.k_proj",
126
+ "language_model.model.layers.50.self_attn.v_proj",
127
+ "language_model.model.layers.8.self_attn.q_proj",
128
+ "language_model.model.layers.29.self_attn.k_proj",
129
+ "language_model.model.layers.28.self_attn.q_proj",
130
+ "language_model.model.layers.37.self_attn.k_proj",
131
+ "language_model.model.layers.8.self_attn.k_proj",
132
+ "language_model.model.layers.60.self_attn.q_proj",
133
+ "language_model.model.layers.13.self_attn.v_proj",
134
+ "language_model.model.layers.18.self_attn.v_proj",
135
+ "language_model.model.layers.35.mlp.up_proj",
136
+ "language_model.model.layers.56.self_attn.k_proj",
137
+ "language_model.model.layers.27.self_attn.o_proj",
138
+ "language_model.model.layers.40.self_attn.q_proj",
139
+ "language_model.model.layers.25.mlp.down_proj",
140
+ "language_model.model.layers.50.self_attn.k_proj",
141
+ "language_model.model.layers.39.mlp.up_proj",
142
+ "language_model.model.layers.13.mlp.down_proj",
143
+ "language_model.model.layers.39.self_attn.o_proj",
144
+ "language_model.model.layers.56.mlp.down_proj",
145
+ "language_model.model.layers.13.self_attn.k_proj",
146
+ "language_model.model.layers.10.self_attn.k_proj",
147
+ "language_model.model.layers.42.self_attn.v_proj",
148
+ "language_model.model.layers.3.mlp.down_proj",
149
+ "language_model.model.layers.9.self_attn.q_proj",
150
+ "language_model.model.layers.36.mlp.up_proj",
151
+ "language_model.model.layers.29.mlp.down_proj",
152
+ "language_model.model.layers.51.mlp.down_proj",
153
+ "language_model.model.layers.36.self_attn.o_proj",
154
+ "language_model.model.layers.7.mlp.gate_proj",
155
+ "language_model.model.layers.31.mlp.gate_proj",
156
+ "language_model.model.layers.38.mlp.gate_proj",
157
+ "language_model.model.layers.55.mlp.down_proj",
158
+ "language_model.model.layers.30.mlp.down_proj",
159
+ "language_model.model.layers.54.mlp.gate_proj",
160
+ "language_model.model.layers.42.self_attn.q_proj",
161
+ "language_model.model.layers.0.mlp.down_proj",
162
+ "language_model.model.layers.32.self_attn.o_proj",
163
+ "language_model.model.layers.61.self_attn.q_proj",
164
+ "language_model.model.layers.55.self_attn.o_proj",
165
+ "language_model.model.layers.27.self_attn.q_proj",
166
+ "language_model.model.layers.41.mlp.up_proj",
167
+ "language_model.model.layers.2.mlp.down_proj",
168
+ "language_model.model.layers.48.self_attn.v_proj",
169
+ "language_model.model.layers.55.mlp.gate_proj",
170
+ "language_model.model.layers.5.self_attn.o_proj",
171
+ "language_model.model.layers.53.mlp.gate_proj",
172
+ "language_model.model.layers.26.mlp.gate_proj",
173
+ "language_model.model.layers.56.mlp.gate_proj",
174
+ "language_model.model.layers.53.mlp.up_proj",
175
+ "language_model.model.layers.16.self_attn.k_proj",
176
+ "language_model.model.layers.43.mlp.down_proj",
177
+ "language_model.model.layers.1.self_attn.k_proj",
178
+ "language_model.model.layers.19.self_attn.q_proj",
179
+ "language_model.model.layers.3.self_attn.k_proj",
180
+ "language_model.model.layers.21.self_attn.q_proj",
181
+ "language_model.model.layers.15.self_attn.o_proj",
182
+ "language_model.model.layers.57.self_attn.o_proj",
183
+ "language_model.model.layers.49.self_attn.o_proj",
184
+ "language_model.model.layers.50.self_attn.q_proj",
185
+ "language_model.model.layers.58.mlp.down_proj",
186
+ "language_model.model.layers.26.self_attn.k_proj",
187
+ "language_model.model.layers.38.self_attn.v_proj",
188
+ "language_model.model.layers.19.self_attn.v_proj",
189
+ "language_model.model.layers.19.mlp.up_proj",
190
+ "language_model.model.layers.3.self_attn.q_proj",
191
+ "language_model.model.layers.7.mlp.down_proj",
192
+ "language_model.model.layers.9.self_attn.k_proj",
193
+ "language_model.model.layers.29.mlp.up_proj",
194
+ "language_model.model.layers.49.self_attn.q_proj",
195
+ "language_model.model.layers.13.self_attn.q_proj",
196
+ "language_model.model.layers.59.self_attn.o_proj",
197
+ "language_model.model.layers.3.mlp.up_proj",
198
+ "language_model.model.layers.8.self_attn.v_proj",
199
+ "language_model.model.layers.0.self_attn.o_proj",
200
+ "language_model.model.layers.2.mlp.gate_proj",
201
+ "language_model.model.layers.16.self_attn.v_proj",
202
+ "language_model.model.layers.10.self_attn.v_proj",
203
+ "language_model.model.layers.16.mlp.down_proj",
204
+ "language_model.model.layers.20.mlp.gate_proj",
205
+ "language_model.model.layers.55.self_attn.v_proj",
206
+ "language_model.model.layers.49.self_attn.v_proj",
207
+ "language_model.model.layers.17.mlp.down_proj",
208
+ "language_model.model.layers.18.mlp.down_proj",
209
+ "language_model.model.layers.57.mlp.down_proj",
210
+ "language_model.model.layers.40.mlp.down_proj",
211
+ "language_model.model.layers.27.mlp.gate_proj",
212
+ "language_model.model.layers.17.self_attn.k_proj",
213
+ "language_model.model.layers.40.self_attn.o_proj",
214
+ "language_model.model.layers.12.mlp.gate_proj",
215
+ "language_model.model.layers.18.self_attn.q_proj",
216
+ "language_model.model.layers.54.self_attn.q_proj",
217
+ "language_model.model.layers.37.self_attn.o_proj",
218
+ "language_model.model.layers.20.self_attn.q_proj",
219
+ "language_model.model.layers.31.mlp.down_proj",
220
+ "language_model.model.layers.10.mlp.up_proj",
221
+ "language_model.model.layers.7.self_attn.k_proj",
222
+ "language_model.model.layers.1.self_attn.q_proj",
223
+ "language_model.model.layers.5.mlp.gate_proj",
224
+ "language_model.model.layers.61.mlp.down_proj",
225
+ "language_model.model.layers.46.self_attn.v_proj",
226
+ "language_model.model.layers.12.self_attn.v_proj",
227
+ "language_model.model.layers.54.self_attn.o_proj",
228
+ "language_model.model.layers.29.self_attn.o_proj",
229
+ "language_model.model.layers.61.self_attn.k_proj",
230
+ "language_model.model.layers.61.mlp.up_proj",
231
+ "language_model.model.layers.12.self_attn.o_proj",
232
+ "language_model.model.layers.5.mlp.up_proj",
233
+ "language_model.model.layers.54.mlp.down_proj",
234
+ "language_model.model.layers.53.self_attn.v_proj",
235
+ "language_model.model.layers.38.self_attn.k_proj",
236
+ "language_model.model.layers.42.self_attn.o_proj",
237
+ "language_model.model.layers.47.mlp.gate_proj",
238
+ "language_model.model.layers.25.mlp.gate_proj",
239
+ "language_model.model.layers.38.self_attn.q_proj",
240
+ "language_model.model.layers.11.self_attn.k_proj",
241
+ "language_model.model.layers.3.self_attn.v_proj",
242
+ "language_model.model.layers.61.self_attn.v_proj",
243
+ "language_model.model.layers.23.self_attn.k_proj",
244
+ "language_model.model.layers.16.self_attn.o_proj",
245
+ "language_model.model.layers.25.self_attn.q_proj",
246
+ "language_model.model.layers.37.mlp.up_proj",
247
+ "language_model.model.layers.13.mlp.gate_proj",
248
+ "language_model.model.layers.24.self_attn.o_proj",
249
+ "language_model.model.layers.35.self_attn.q_proj",
250
+ "language_model.model.layers.59.self_attn.q_proj",
251
+ "language_model.model.layers.17.self_attn.v_proj",
252
+ "language_model.model.layers.15.mlp.down_proj",
253
+ "language_model.model.layers.48.self_attn.q_proj",
254
+ "language_model.model.layers.61.self_attn.o_proj",
255
+ "language_model.model.layers.30.self_attn.k_proj",
256
+ "language_model.model.layers.21.mlp.up_proj",
257
+ "language_model.model.layers.44.mlp.down_proj",
258
+ "language_model.model.layers.12.self_attn.k_proj",
259
+ "language_model.model.layers.31.self_attn.q_proj",
260
+ "language_model.model.layers.31.self_attn.v_proj",
261
+ "language_model.model.layers.1.mlp.gate_proj",
262
+ "language_model.model.layers.22.self_attn.o_proj",
263
+ "language_model.model.layers.47.mlp.down_proj",
264
+ "language_model.model.layers.4.self_attn.k_proj",
265
+ "language_model.model.layers.25.self_attn.k_proj",
266
+ "language_model.model.layers.41.self_attn.k_proj",
267
+ "language_model.model.layers.33.self_attn.v_proj",
268
+ "language_model.model.layers.26.self_attn.q_proj",
269
+ "language_model.model.layers.9.mlp.down_proj",
270
+ "language_model.model.layers.45.self_attn.k_proj",
271
+ "language_model.model.layers.38.mlp.up_proj",
272
+ "language_model.model.layers.0.mlp.up_proj",
273
+ "language_model.model.layers.59.mlp.gate_proj",
274
+ "language_model.model.layers.5.self_attn.k_proj",
275
+ "language_model.model.layers.10.self_attn.o_proj",
276
+ "language_model.model.layers.60.mlp.up_proj",
277
+ "language_model.model.layers.26.self_attn.v_proj",
278
+ "language_model.model.layers.40.mlp.gate_proj",
279
+ "language_model.model.layers.60.self_attn.o_proj",
280
+ "language_model.model.layers.0.mlp.gate_proj",
281
+ "language_model.model.layers.39.mlp.down_proj",
282
+ "language_model.model.layers.28.self_attn.k_proj",
283
+ "language_model.model.layers.19.self_attn.o_proj",
284
+ "language_model.model.layers.43.self_attn.k_proj",
285
+ "language_model.model.layers.11.self_attn.q_proj",
286
+ "language_model.model.layers.41.mlp.gate_proj",
287
+ "language_model.model.layers.35.mlp.down_proj",
288
+ "language_model.model.layers.52.self_attn.o_proj",
289
+ "language_model.model.layers.32.self_attn.q_proj",
290
+ "language_model.model.layers.30.mlp.up_proj",
291
+ "language_model.model.layers.47.self_attn.q_proj",
292
+ "language_model.model.layers.21.mlp.down_proj",
293
+ "language_model.model.layers.24.mlp.gate_proj",
294
+ "language_model.model.layers.53.self_attn.o_proj",
295
+ "language_model.model.layers.42.mlp.down_proj",
296
+ "language_model.model.layers.44.self_attn.o_proj",
297
+ "language_model.model.layers.24.mlp.down_proj",
298
+ "language_model.model.layers.29.self_attn.q_proj",
299
+ "language_model.model.layers.49.mlp.up_proj",
300
+ "language_model.model.layers.4.self_attn.q_proj",
301
+ "language_model.model.layers.56.mlp.up_proj",
302
+ "language_model.model.layers.8.mlp.down_proj",
303
+ "language_model.model.layers.25.self_attn.v_proj",
304
+ "language_model.model.layers.37.self_attn.q_proj",
305
+ "language_model.model.layers.6.self_attn.k_proj",
306
+ "language_model.model.layers.14.self_attn.o_proj",
307
+ "language_model.model.layers.60.self_attn.k_proj",
308
+ "language_model.model.layers.2.mlp.up_proj",
309
+ "language_model.model.layers.34.mlp.gate_proj",
310
+ "language_model.model.layers.52.mlp.gate_proj",
311
+ "language_model.model.layers.6.mlp.down_proj",
312
+ "language_model.model.layers.45.self_attn.q_proj",
313
+ "language_model.model.layers.41.self_attn.q_proj",
314
+ "language_model.model.layers.52.self_attn.k_proj",
315
+ "language_model.model.layers.36.self_attn.v_proj",
316
+ "language_model.model.layers.28.mlp.down_proj",
317
+ "language_model.model.layers.15.self_attn.v_proj",
318
+ "language_model.model.layers.11.self_attn.o_proj",
319
+ "language_model.model.layers.29.mlp.gate_proj",
320
+ "language_model.model.layers.42.self_attn.k_proj",
321
+ "language_model.model.layers.52.mlp.up_proj",
322
+ "language_model.model.layers.22.self_attn.k_proj",
323
+ "language_model.model.layers.14.mlp.down_proj",
324
+ "language_model.model.layers.4.mlp.down_proj",
325
+ "language_model.model.layers.35.self_attn.k_proj",
326
+ "language_model.model.layers.52.self_attn.q_proj",
327
+ "language_model.model.layers.22.self_attn.q_proj",
328
+ "language_model.model.layers.58.mlp.gate_proj",
329
+ "language_model.model.layers.14.mlp.gate_proj",
330
+ "language_model.model.layers.47.self_attn.k_proj",
331
+ "language_model.model.layers.39.self_attn.q_proj",
332
+ "language_model.model.layers.42.mlp.up_proj",
333
+ "language_model.model.layers.34.mlp.down_proj",
334
+ "language_model.model.layers.30.self_attn.v_proj",
335
+ "language_model.model.layers.56.self_attn.o_proj",
336
+ "language_model.model.layers.25.self_attn.o_proj",
337
+ "language_model.model.layers.45.mlp.up_proj",
338
+ "language_model.model.layers.48.mlp.down_proj",
339
+ "language_model.model.layers.7.self_attn.o_proj",
340
+ "language_model.model.layers.18.self_attn.k_proj",
341
+ "language_model.model.layers.14.self_attn.v_proj",
342
+ "language_model.model.layers.40.self_attn.v_proj",
343
+ "language_model.model.layers.22.self_attn.v_proj",
344
+ "language_model.model.layers.7.self_attn.q_proj",
345
+ "language_model.model.layers.46.mlp.gate_proj",
346
+ "language_model.model.layers.56.self_attn.q_proj",
347
+ "language_model.model.layers.28.mlp.up_proj",
348
+ "language_model.model.layers.50.mlp.gate_proj",
349
+ "language_model.model.layers.23.self_attn.v_proj",
350
+ "language_model.model.layers.15.self_attn.q_proj",
351
+ "language_model.model.layers.9.mlp.gate_proj",
352
+ "language_model.model.layers.47.mlp.up_proj",
353
+ "language_model.model.layers.6.self_attn.o_proj",
354
+ "language_model.model.layers.12.self_attn.q_proj",
355
+ "language_model.model.layers.20.self_attn.k_proj",
356
+ "language_model.model.layers.51.mlp.up_proj",
357
+ "language_model.model.layers.58.self_attn.v_proj",
358
+ "language_model.model.layers.22.mlp.gate_proj",
359
+ "language_model.model.layers.14.mlp.up_proj",
360
+ "language_model.model.layers.33.self_attn.o_proj",
361
+ "language_model.model.layers.11.mlp.down_proj",
362
+ "language_model.model.layers.16.mlp.gate_proj",
363
+ "language_model.model.layers.40.mlp.up_proj",
364
+ "language_model.model.layers.60.self_attn.v_proj",
365
+ "language_model.model.layers.2.self_attn.k_proj",
366
+ "language_model.model.layers.26.self_attn.o_proj",
367
+ "language_model.model.layers.12.mlp.up_proj",
368
+ "language_model.model.layers.28.self_attn.o_proj",
369
+ "language_model.model.layers.32.mlp.up_proj",
370
+ "language_model.model.layers.45.self_attn.o_proj",
371
+ "language_model.model.layers.28.mlp.gate_proj",
372
+ "language_model.model.layers.11.mlp.gate_proj",
373
+ "language_model.model.layers.11.self_attn.v_proj",
374
+ "language_model.model.layers.2.self_attn.v_proj",
375
+ "language_model.model.layers.9.self_attn.v_proj",
376
+ "language_model.model.layers.19.self_attn.k_proj",
377
+ "language_model.model.layers.32.self_attn.k_proj",
378
+ "language_model.model.layers.43.self_attn.q_proj",
379
+ "language_model.model.layers.21.mlp.gate_proj",
380
+ "language_model.model.layers.45.self_attn.v_proj",
381
+ "language_model.model.layers.41.mlp.down_proj",
382
+ "language_model.model.layers.36.mlp.down_proj",
383
+ "language_model.model.layers.53.self_attn.k_proj",
384
+ "language_model.model.layers.16.self_attn.q_proj",
385
+ "language_model.model.layers.3.mlp.gate_proj",
386
+ "language_model.model.layers.15.self_attn.k_proj",
387
+ "language_model.model.layers.33.mlp.gate_proj",
388
+ "language_model.model.layers.48.mlp.up_proj",
389
+ "language_model.model.layers.33.self_attn.q_proj",
390
+ "language_model.model.layers.54.self_attn.k_proj",
391
+ "language_model.model.layers.7.self_attn.v_proj",
392
+ "language_model.model.layers.5.mlp.down_proj",
393
+ "language_model.model.layers.50.self_attn.o_proj",
394
+ "language_model.model.layers.21.self_attn.o_proj",
395
+ "language_model.model.layers.2.self_attn.q_proj",
396
+ "language_model.model.layers.27.self_attn.k_proj",
397
+ "language_model.model.layers.46.self_attn.q_proj",
398
+ "language_model.model.layers.20.self_attn.v_proj",
399
+ "language_model.model.layers.34.self_attn.k_proj",
400
+ "language_model.model.layers.18.mlp.gate_proj",
401
+ "language_model.model.layers.35.self_attn.v_proj",
402
+ "language_model.model.layers.32.mlp.down_proj",
403
+ "language_model.model.layers.6.self_attn.q_proj",
404
+ "language_model.model.layers.20.mlp.down_proj",
405
+ "language_model.model.layers.27.mlp.up_proj",
406
+ "language_model.model.layers.31.self_attn.o_proj",
407
+ "language_model.model.layers.59.mlp.down_proj",
408
+ "language_model.model.layers.4.self_attn.o_proj",
409
+ "language_model.model.layers.15.mlp.gate_proj",
410
+ "language_model.model.layers.44.self_attn.q_proj",
411
+ "language_model.model.layers.31.mlp.up_proj",
412
+ "language_model.model.layers.30.mlp.gate_proj",
413
+ "language_model.model.layers.42.mlp.gate_proj",
414
+ "language_model.model.layers.19.mlp.gate_proj",
415
+ "language_model.model.layers.38.mlp.down_proj",
416
+ "language_model.model.layers.23.self_attn.o_proj",
417
+ "language_model.model.layers.16.mlp.up_proj",
418
+ "language_model.model.layers.52.self_attn.v_proj",
419
+ "language_model.model.layers.22.mlp.up_proj",
420
+ "language_model.model.layers.41.self_attn.o_proj",
421
+ "language_model.model.layers.19.mlp.down_proj",
422
+ "language_model.model.layers.37.mlp.down_proj",
423
+ "language_model.model.layers.8.self_attn.o_proj",
424
+ "language_model.model.layers.57.self_attn.k_proj",
425
+ "language_model.model.layers.41.self_attn.v_proj",
426
+ "language_model.model.layers.21.self_attn.v_proj",
427
+ "language_model.model.layers.34.self_attn.q_proj",
428
+ "language_model.model.layers.47.self_attn.v_proj",
429
+ "language_model.model.layers.32.mlp.gate_proj",
430
+ "language_model.model.layers.43.self_attn.o_proj",
431
+ "language_model.model.layers.21.self_attn.k_proj",
432
+ "language_model.model.layers.1.self_attn.v_proj",
433
+ "language_model.model.layers.8.mlp.up_proj",
434
+ "language_model.model.layers.20.mlp.up_proj",
435
+ "language_model.model.layers.23.mlp.gate_proj",
436
+ "language_model.model.layers.43.mlp.gate_proj",
437
+ "language_model.model.layers.54.self_attn.v_proj",
438
+ "language_model.model.layers.54.mlp.up_proj",
439
+ "language_model.model.layers.50.mlp.up_proj",
440
+ "language_model.model.layers.48.mlp.gate_proj",
441
+ "language_model.model.layers.35.mlp.gate_proj",
442
+ "language_model.model.layers.0.self_attn.k_proj",
443
+ "language_model.model.layers.47.self_attn.o_proj",
444
+ "language_model.model.layers.51.self_attn.k_proj",
445
+ "language_model.model.layers.58.self_attn.k_proj",
446
+ "language_model.model.layers.35.self_attn.o_proj",
447
+ "language_model.model.layers.55.self_attn.q_proj",
448
+ "language_model.model.layers.57.mlp.gate_proj",
449
+ "language_model.model.layers.6.mlp.up_proj",
450
+ "language_model.model.layers.7.mlp.up_proj",
451
+ "language_model.model.layers.34.self_attn.v_proj",
452
+ "language_model.model.layers.48.self_attn.o_proj",
453
+ "language_model.model.layers.36.self_attn.q_proj",
454
+ "language_model.model.layers.6.self_attn.v_proj",
455
+ "language_model.model.layers.53.mlp.down_proj",
456
+ "language_model.model.layers.49.mlp.gate_proj"
457
+ ],
458
+ "task_type": "CAUSAL_LM",
459
+ "use_dora": false,
460
+ "use_rslora": false
461
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2858bf7ea1a8dfcfd2e9c7c507b264f099f9dbe08796a8fc21788c499f6d5ad8
3
+ size 1816396048
added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
all_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "num_input_tokens_seen": 1130688,
4
+ "total_flos": 1.7959027775641805e+17,
5
+ "train_loss": 2.86213872662629,
6
+ "train_runtime": 778.6794,
7
+ "train_samples_per_second": 1.233,
8
+ "train_steps_per_second": 0.154
9
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"
3
+ }
checkpoint-100/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-27b-it
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-100/adapter_config.json ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "google/gemma-3-27b-it",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "language_model.model.layers.18.mlp.up_proj",
24
+ "language_model.model.layers.14.self_attn.q_proj",
25
+ "language_model.model.layers.23.mlp.down_proj",
26
+ "language_model.model.layers.5.self_attn.q_proj",
27
+ "language_model.model.layers.48.self_attn.k_proj",
28
+ "language_model.model.layers.29.self_attn.v_proj",
29
+ "language_model.model.layers.39.self_attn.v_proj",
30
+ "language_model.model.layers.5.self_attn.v_proj",
31
+ "language_model.model.layers.60.mlp.gate_proj",
32
+ "language_model.model.layers.17.self_attn.o_proj",
33
+ "language_model.model.layers.60.mlp.down_proj",
34
+ "language_model.model.layers.33.mlp.up_proj",
35
+ "language_model.model.layers.53.self_attn.q_proj",
36
+ "language_model.model.layers.27.mlp.down_proj",
37
+ "language_model.model.layers.57.self_attn.v_proj",
38
+ "language_model.model.layers.26.mlp.up_proj",
39
+ "language_model.model.layers.10.mlp.gate_proj",
40
+ "language_model.model.layers.23.mlp.up_proj",
41
+ "language_model.model.layers.57.self_attn.q_proj",
42
+ "language_model.model.layers.57.mlp.up_proj",
43
+ "language_model.model.layers.39.mlp.gate_proj",
44
+ "language_model.model.layers.26.mlp.down_proj",
45
+ "language_model.model.layers.11.mlp.up_proj",
46
+ "language_model.model.layers.25.mlp.up_proj",
47
+ "language_model.model.layers.6.mlp.gate_proj",
48
+ "language_model.model.layers.36.mlp.gate_proj",
49
+ "language_model.model.layers.2.self_attn.o_proj",
50
+ "language_model.model.layers.30.self_attn.o_proj",
51
+ "language_model.model.layers.4.mlp.gate_proj",
52
+ "language_model.model.layers.13.mlp.up_proj",
53
+ "language_model.model.layers.52.mlp.down_proj",
54
+ "language_model.model.layers.4.mlp.up_proj",
55
+ "language_model.model.layers.32.self_attn.v_proj",
56
+ "language_model.model.layers.1.self_attn.o_proj",
57
+ "language_model.model.layers.22.mlp.down_proj",
58
+ "language_model.model.layers.45.mlp.down_proj",
59
+ "language_model.model.layers.46.self_attn.o_proj",
60
+ "language_model.model.layers.38.self_attn.o_proj",
61
+ "language_model.model.layers.23.self_attn.q_proj",
62
+ "language_model.model.layers.46.mlp.up_proj",
63
+ "language_model.model.layers.49.mlp.down_proj",
64
+ "language_model.model.layers.40.self_attn.k_proj",
65
+ "language_model.model.layers.24.self_attn.q_proj",
66
+ "language_model.model.layers.9.self_attn.o_proj",
67
+ "language_model.model.layers.37.mlp.gate_proj",
68
+ "language_model.model.layers.36.self_attn.k_proj",
69
+ "language_model.model.layers.49.self_attn.k_proj",
70
+ "language_model.model.layers.24.self_attn.v_proj",
71
+ "language_model.model.layers.1.mlp.down_proj",
72
+ "language_model.model.layers.10.mlp.down_proj",
73
+ "language_model.model.layers.61.mlp.gate_proj",
74
+ "language_model.model.layers.34.self_attn.o_proj",
75
+ "language_model.model.layers.9.mlp.up_proj",
76
+ "language_model.model.layers.59.self_attn.v_proj",
77
+ "language_model.model.layers.30.self_attn.q_proj",
78
+ "language_model.model.layers.44.mlp.gate_proj",
79
+ "language_model.model.layers.46.mlp.down_proj",
80
+ "language_model.model.layers.46.self_attn.k_proj",
81
+ "language_model.model.layers.28.self_attn.v_proj",
82
+ "language_model.model.layers.58.self_attn.o_proj",
83
+ "language_model.model.layers.13.self_attn.o_proj",
84
+ "language_model.model.layers.24.mlp.up_proj",
85
+ "language_model.model.layers.0.self_attn.q_proj",
86
+ "language_model.model.layers.58.mlp.up_proj",
87
+ "language_model.model.layers.12.mlp.down_proj",
88
+ "language_model.model.layers.43.mlp.up_proj",
89
+ "language_model.model.layers.44.self_attn.k_proj",
90
+ "language_model.model.layers.17.self_attn.q_proj",
91
+ "language_model.model.layers.59.mlp.up_proj",
92
+ "language_model.model.layers.50.mlp.down_proj",
93
+ "language_model.model.layers.24.self_attn.k_proj",
94
+ "language_model.model.layers.59.self_attn.k_proj",
95
+ "language_model.model.layers.1.mlp.up_proj",
96
+ "language_model.model.layers.58.self_attn.q_proj",
97
+ "language_model.model.layers.51.mlp.gate_proj",
98
+ "language_model.model.layers.4.self_attn.v_proj",
99
+ "language_model.model.layers.45.mlp.gate_proj",
100
+ "language_model.model.layers.8.mlp.gate_proj",
101
+ "language_model.model.layers.43.self_attn.v_proj",
102
+ "language_model.model.layers.51.self_attn.v_proj",
103
+ "language_model.model.layers.37.self_attn.v_proj",
104
+ "language_model.model.layers.51.self_attn.q_proj",
105
+ "language_model.model.layers.51.self_attn.o_proj",
106
+ "language_model.model.layers.33.mlp.down_proj",
107
+ "language_model.model.layers.44.self_attn.v_proj",
108
+ "language_model.model.layers.18.self_attn.o_proj",
109
+ "language_model.model.layers.14.self_attn.k_proj",
110
+ "language_model.model.layers.55.self_attn.k_proj",
111
+ "language_model.model.layers.10.self_attn.q_proj",
112
+ "language_model.model.layers.34.mlp.up_proj",
113
+ "language_model.model.layers.27.self_attn.v_proj",
114
+ "language_model.model.layers.20.self_attn.o_proj",
115
+ "language_model.model.layers.33.self_attn.k_proj",
116
+ "language_model.model.layers.44.mlp.up_proj",
117
+ "language_model.model.layers.3.self_attn.o_proj",
118
+ "language_model.model.layers.15.mlp.up_proj",
119
+ "language_model.model.layers.17.mlp.gate_proj",
120
+ "language_model.model.layers.17.mlp.up_proj",
121
+ "language_model.model.layers.55.mlp.up_proj",
122
+ "language_model.model.layers.0.self_attn.v_proj",
123
+ "language_model.model.layers.56.self_attn.v_proj",
124
+ "language_model.model.layers.39.self_attn.k_proj",
125
+ "language_model.model.layers.31.self_attn.k_proj",
126
+ "language_model.model.layers.50.self_attn.v_proj",
127
+ "language_model.model.layers.8.self_attn.q_proj",
128
+ "language_model.model.layers.29.self_attn.k_proj",
129
+ "language_model.model.layers.28.self_attn.q_proj",
130
+ "language_model.model.layers.37.self_attn.k_proj",
131
+ "language_model.model.layers.8.self_attn.k_proj",
132
+ "language_model.model.layers.60.self_attn.q_proj",
133
+ "language_model.model.layers.13.self_attn.v_proj",
134
+ "language_model.model.layers.18.self_attn.v_proj",
135
+ "language_model.model.layers.35.mlp.up_proj",
136
+ "language_model.model.layers.56.self_attn.k_proj",
137
+ "language_model.model.layers.27.self_attn.o_proj",
138
+ "language_model.model.layers.40.self_attn.q_proj",
139
+ "language_model.model.layers.25.mlp.down_proj",
140
+ "language_model.model.layers.50.self_attn.k_proj",
141
+ "language_model.model.layers.39.mlp.up_proj",
142
+ "language_model.model.layers.13.mlp.down_proj",
143
+ "language_model.model.layers.39.self_attn.o_proj",
144
+ "language_model.model.layers.56.mlp.down_proj",
145
+ "language_model.model.layers.13.self_attn.k_proj",
146
+ "language_model.model.layers.10.self_attn.k_proj",
147
+ "language_model.model.layers.42.self_attn.v_proj",
148
+ "language_model.model.layers.3.mlp.down_proj",
149
+ "language_model.model.layers.9.self_attn.q_proj",
150
+ "language_model.model.layers.36.mlp.up_proj",
151
+ "language_model.model.layers.29.mlp.down_proj",
152
+ "language_model.model.layers.51.mlp.down_proj",
153
+ "language_model.model.layers.36.self_attn.o_proj",
154
+ "language_model.model.layers.7.mlp.gate_proj",
155
+ "language_model.model.layers.31.mlp.gate_proj",
156
+ "language_model.model.layers.38.mlp.gate_proj",
157
+ "language_model.model.layers.55.mlp.down_proj",
158
+ "language_model.model.layers.30.mlp.down_proj",
159
+ "language_model.model.layers.54.mlp.gate_proj",
160
+ "language_model.model.layers.42.self_attn.q_proj",
161
+ "language_model.model.layers.0.mlp.down_proj",
162
+ "language_model.model.layers.32.self_attn.o_proj",
163
+ "language_model.model.layers.61.self_attn.q_proj",
164
+ "language_model.model.layers.55.self_attn.o_proj",
165
+ "language_model.model.layers.27.self_attn.q_proj",
166
+ "language_model.model.layers.41.mlp.up_proj",
167
+ "language_model.model.layers.2.mlp.down_proj",
168
+ "language_model.model.layers.48.self_attn.v_proj",
169
+ "language_model.model.layers.55.mlp.gate_proj",
170
+ "language_model.model.layers.5.self_attn.o_proj",
171
+ "language_model.model.layers.53.mlp.gate_proj",
172
+ "language_model.model.layers.26.mlp.gate_proj",
173
+ "language_model.model.layers.56.mlp.gate_proj",
174
+ "language_model.model.layers.53.mlp.up_proj",
175
+ "language_model.model.layers.16.self_attn.k_proj",
176
+ "language_model.model.layers.43.mlp.down_proj",
177
+ "language_model.model.layers.1.self_attn.k_proj",
178
+ "language_model.model.layers.19.self_attn.q_proj",
179
+ "language_model.model.layers.3.self_attn.k_proj",
180
+ "language_model.model.layers.21.self_attn.q_proj",
181
+ "language_model.model.layers.15.self_attn.o_proj",
182
+ "language_model.model.layers.57.self_attn.o_proj",
183
+ "language_model.model.layers.49.self_attn.o_proj",
184
+ "language_model.model.layers.50.self_attn.q_proj",
185
+ "language_model.model.layers.58.mlp.down_proj",
186
+ "language_model.model.layers.26.self_attn.k_proj",
187
+ "language_model.model.layers.38.self_attn.v_proj",
188
+ "language_model.model.layers.19.self_attn.v_proj",
189
+ "language_model.model.layers.19.mlp.up_proj",
190
+ "language_model.model.layers.3.self_attn.q_proj",
191
+ "language_model.model.layers.7.mlp.down_proj",
192
+ "language_model.model.layers.9.self_attn.k_proj",
193
+ "language_model.model.layers.29.mlp.up_proj",
194
+ "language_model.model.layers.49.self_attn.q_proj",
195
+ "language_model.model.layers.13.self_attn.q_proj",
196
+ "language_model.model.layers.59.self_attn.o_proj",
197
+ "language_model.model.layers.3.mlp.up_proj",
198
+ "language_model.model.layers.8.self_attn.v_proj",
199
+ "language_model.model.layers.0.self_attn.o_proj",
200
+ "language_model.model.layers.2.mlp.gate_proj",
201
+ "language_model.model.layers.16.self_attn.v_proj",
202
+ "language_model.model.layers.10.self_attn.v_proj",
203
+ "language_model.model.layers.16.mlp.down_proj",
204
+ "language_model.model.layers.20.mlp.gate_proj",
205
+ "language_model.model.layers.55.self_attn.v_proj",
206
+ "language_model.model.layers.49.self_attn.v_proj",
207
+ "language_model.model.layers.17.mlp.down_proj",
208
+ "language_model.model.layers.18.mlp.down_proj",
209
+ "language_model.model.layers.57.mlp.down_proj",
210
+ "language_model.model.layers.40.mlp.down_proj",
211
+ "language_model.model.layers.27.mlp.gate_proj",
212
+ "language_model.model.layers.17.self_attn.k_proj",
213
+ "language_model.model.layers.40.self_attn.o_proj",
214
+ "language_model.model.layers.12.mlp.gate_proj",
215
+ "language_model.model.layers.18.self_attn.q_proj",
216
+ "language_model.model.layers.54.self_attn.q_proj",
217
+ "language_model.model.layers.37.self_attn.o_proj",
218
+ "language_model.model.layers.20.self_attn.q_proj",
219
+ "language_model.model.layers.31.mlp.down_proj",
220
+ "language_model.model.layers.10.mlp.up_proj",
221
+ "language_model.model.layers.7.self_attn.k_proj",
222
+ "language_model.model.layers.1.self_attn.q_proj",
223
+ "language_model.model.layers.5.mlp.gate_proj",
224
+ "language_model.model.layers.61.mlp.down_proj",
225
+ "language_model.model.layers.46.self_attn.v_proj",
226
+ "language_model.model.layers.12.self_attn.v_proj",
227
+ "language_model.model.layers.54.self_attn.o_proj",
228
+ "language_model.model.layers.29.self_attn.o_proj",
229
+ "language_model.model.layers.61.self_attn.k_proj",
230
+ "language_model.model.layers.61.mlp.up_proj",
231
+ "language_model.model.layers.12.self_attn.o_proj",
232
+ "language_model.model.layers.5.mlp.up_proj",
233
+ "language_model.model.layers.54.mlp.down_proj",
234
+ "language_model.model.layers.53.self_attn.v_proj",
235
+ "language_model.model.layers.38.self_attn.k_proj",
236
+ "language_model.model.layers.42.self_attn.o_proj",
237
+ "language_model.model.layers.47.mlp.gate_proj",
238
+ "language_model.model.layers.25.mlp.gate_proj",
239
+ "language_model.model.layers.38.self_attn.q_proj",
240
+ "language_model.model.layers.11.self_attn.k_proj",
241
+ "language_model.model.layers.3.self_attn.v_proj",
242
+ "language_model.model.layers.61.self_attn.v_proj",
243
+ "language_model.model.layers.23.self_attn.k_proj",
244
+ "language_model.model.layers.16.self_attn.o_proj",
245
+ "language_model.model.layers.25.self_attn.q_proj",
246
+ "language_model.model.layers.37.mlp.up_proj",
247
+ "language_model.model.layers.13.mlp.gate_proj",
248
+ "language_model.model.layers.24.self_attn.o_proj",
249
+ "language_model.model.layers.35.self_attn.q_proj",
250
+ "language_model.model.layers.59.self_attn.q_proj",
251
+ "language_model.model.layers.17.self_attn.v_proj",
252
+ "language_model.model.layers.15.mlp.down_proj",
253
+ "language_model.model.layers.48.self_attn.q_proj",
254
+ "language_model.model.layers.61.self_attn.o_proj",
255
+ "language_model.model.layers.30.self_attn.k_proj",
256
+ "language_model.model.layers.21.mlp.up_proj",
257
+ "language_model.model.layers.44.mlp.down_proj",
258
+ "language_model.model.layers.12.self_attn.k_proj",
259
+ "language_model.model.layers.31.self_attn.q_proj",
260
+ "language_model.model.layers.31.self_attn.v_proj",
261
+ "language_model.model.layers.1.mlp.gate_proj",
262
+ "language_model.model.layers.22.self_attn.o_proj",
263
+ "language_model.model.layers.47.mlp.down_proj",
264
+ "language_model.model.layers.4.self_attn.k_proj",
265
+ "language_model.model.layers.25.self_attn.k_proj",
266
+ "language_model.model.layers.41.self_attn.k_proj",
267
+ "language_model.model.layers.33.self_attn.v_proj",
268
+ "language_model.model.layers.26.self_attn.q_proj",
269
+ "language_model.model.layers.9.mlp.down_proj",
270
+ "language_model.model.layers.45.self_attn.k_proj",
271
+ "language_model.model.layers.38.mlp.up_proj",
272
+ "language_model.model.layers.0.mlp.up_proj",
273
+ "language_model.model.layers.59.mlp.gate_proj",
274
+ "language_model.model.layers.5.self_attn.k_proj",
275
+ "language_model.model.layers.10.self_attn.o_proj",
276
+ "language_model.model.layers.60.mlp.up_proj",
277
+ "language_model.model.layers.26.self_attn.v_proj",
278
+ "language_model.model.layers.40.mlp.gate_proj",
279
+ "language_model.model.layers.60.self_attn.o_proj",
280
+ "language_model.model.layers.0.mlp.gate_proj",
281
+ "language_model.model.layers.39.mlp.down_proj",
282
+ "language_model.model.layers.28.self_attn.k_proj",
283
+ "language_model.model.layers.19.self_attn.o_proj",
284
+ "language_model.model.layers.43.self_attn.k_proj",
285
+ "language_model.model.layers.11.self_attn.q_proj",
286
+ "language_model.model.layers.41.mlp.gate_proj",
287
+ "language_model.model.layers.35.mlp.down_proj",
288
+ "language_model.model.layers.52.self_attn.o_proj",
289
+ "language_model.model.layers.32.self_attn.q_proj",
290
+ "language_model.model.layers.30.mlp.up_proj",
291
+ "language_model.model.layers.47.self_attn.q_proj",
292
+ "language_model.model.layers.21.mlp.down_proj",
293
+ "language_model.model.layers.24.mlp.gate_proj",
294
+ "language_model.model.layers.53.self_attn.o_proj",
295
+ "language_model.model.layers.42.mlp.down_proj",
296
+ "language_model.model.layers.44.self_attn.o_proj",
297
+ "language_model.model.layers.24.mlp.down_proj",
298
+ "language_model.model.layers.29.self_attn.q_proj",
299
+ "language_model.model.layers.49.mlp.up_proj",
300
+ "language_model.model.layers.4.self_attn.q_proj",
301
+ "language_model.model.layers.56.mlp.up_proj",
302
+ "language_model.model.layers.8.mlp.down_proj",
303
+ "language_model.model.layers.25.self_attn.v_proj",
304
+ "language_model.model.layers.37.self_attn.q_proj",
305
+ "language_model.model.layers.6.self_attn.k_proj",
306
+ "language_model.model.layers.14.self_attn.o_proj",
307
+ "language_model.model.layers.60.self_attn.k_proj",
308
+ "language_model.model.layers.2.mlp.up_proj",
309
+ "language_model.model.layers.34.mlp.gate_proj",
310
+ "language_model.model.layers.52.mlp.gate_proj",
311
+ "language_model.model.layers.6.mlp.down_proj",
312
+ "language_model.model.layers.45.self_attn.q_proj",
313
+ "language_model.model.layers.41.self_attn.q_proj",
314
+ "language_model.model.layers.52.self_attn.k_proj",
315
+ "language_model.model.layers.36.self_attn.v_proj",
316
+ "language_model.model.layers.28.mlp.down_proj",
317
+ "language_model.model.layers.15.self_attn.v_proj",
318
+ "language_model.model.layers.11.self_attn.o_proj",
319
+ "language_model.model.layers.29.mlp.gate_proj",
320
+ "language_model.model.layers.42.self_attn.k_proj",
321
+ "language_model.model.layers.52.mlp.up_proj",
322
+ "language_model.model.layers.22.self_attn.k_proj",
323
+ "language_model.model.layers.14.mlp.down_proj",
324
+ "language_model.model.layers.4.mlp.down_proj",
325
+ "language_model.model.layers.35.self_attn.k_proj",
326
+ "language_model.model.layers.52.self_attn.q_proj",
327
+ "language_model.model.layers.22.self_attn.q_proj",
328
+ "language_model.model.layers.58.mlp.gate_proj",
329
+ "language_model.model.layers.14.mlp.gate_proj",
330
+ "language_model.model.layers.47.self_attn.k_proj",
331
+ "language_model.model.layers.39.self_attn.q_proj",
332
+ "language_model.model.layers.42.mlp.up_proj",
333
+ "language_model.model.layers.34.mlp.down_proj",
334
+ "language_model.model.layers.30.self_attn.v_proj",
335
+ "language_model.model.layers.56.self_attn.o_proj",
336
+ "language_model.model.layers.25.self_attn.o_proj",
337
+ "language_model.model.layers.45.mlp.up_proj",
338
+ "language_model.model.layers.48.mlp.down_proj",
339
+ "language_model.model.layers.7.self_attn.o_proj",
340
+ "language_model.model.layers.18.self_attn.k_proj",
341
+ "language_model.model.layers.14.self_attn.v_proj",
342
+ "language_model.model.layers.40.self_attn.v_proj",
343
+ "language_model.model.layers.22.self_attn.v_proj",
344
+ "language_model.model.layers.7.self_attn.q_proj",
345
+ "language_model.model.layers.46.mlp.gate_proj",
346
+ "language_model.model.layers.56.self_attn.q_proj",
347
+ "language_model.model.layers.28.mlp.up_proj",
348
+ "language_model.model.layers.50.mlp.gate_proj",
349
+ "language_model.model.layers.23.self_attn.v_proj",
350
+ "language_model.model.layers.15.self_attn.q_proj",
351
+ "language_model.model.layers.9.mlp.gate_proj",
352
+ "language_model.model.layers.47.mlp.up_proj",
353
+ "language_model.model.layers.6.self_attn.o_proj",
354
+ "language_model.model.layers.12.self_attn.q_proj",
355
+ "language_model.model.layers.20.self_attn.k_proj",
356
+ "language_model.model.layers.51.mlp.up_proj",
357
+ "language_model.model.layers.58.self_attn.v_proj",
358
+ "language_model.model.layers.22.mlp.gate_proj",
359
+ "language_model.model.layers.14.mlp.up_proj",
360
+ "language_model.model.layers.33.self_attn.o_proj",
361
+ "language_model.model.layers.11.mlp.down_proj",
362
+ "language_model.model.layers.16.mlp.gate_proj",
363
+ "language_model.model.layers.40.mlp.up_proj",
364
+ "language_model.model.layers.60.self_attn.v_proj",
365
+ "language_model.model.layers.2.self_attn.k_proj",
366
+ "language_model.model.layers.26.self_attn.o_proj",
367
+ "language_model.model.layers.12.mlp.up_proj",
368
+ "language_model.model.layers.28.self_attn.o_proj",
369
+ "language_model.model.layers.32.mlp.up_proj",
370
+ "language_model.model.layers.45.self_attn.o_proj",
371
+ "language_model.model.layers.28.mlp.gate_proj",
372
+ "language_model.model.layers.11.mlp.gate_proj",
373
+ "language_model.model.layers.11.self_attn.v_proj",
374
+ "language_model.model.layers.2.self_attn.v_proj",
375
+ "language_model.model.layers.9.self_attn.v_proj",
376
+ "language_model.model.layers.19.self_attn.k_proj",
377
+ "language_model.model.layers.32.self_attn.k_proj",
378
+ "language_model.model.layers.43.self_attn.q_proj",
379
+ "language_model.model.layers.21.mlp.gate_proj",
380
+ "language_model.model.layers.45.self_attn.v_proj",
381
+ "language_model.model.layers.41.mlp.down_proj",
382
+ "language_model.model.layers.36.mlp.down_proj",
383
+ "language_model.model.layers.53.self_attn.k_proj",
384
+ "language_model.model.layers.16.self_attn.q_proj",
385
+ "language_model.model.layers.3.mlp.gate_proj",
386
+ "language_model.model.layers.15.self_attn.k_proj",
387
+ "language_model.model.layers.33.mlp.gate_proj",
388
+ "language_model.model.layers.48.mlp.up_proj",
389
+ "language_model.model.layers.33.self_attn.q_proj",
390
+ "language_model.model.layers.54.self_attn.k_proj",
391
+ "language_model.model.layers.7.self_attn.v_proj",
392
+ "language_model.model.layers.5.mlp.down_proj",
393
+ "language_model.model.layers.50.self_attn.o_proj",
394
+ "language_model.model.layers.21.self_attn.o_proj",
395
+ "language_model.model.layers.2.self_attn.q_proj",
396
+ "language_model.model.layers.27.self_attn.k_proj",
397
+ "language_model.model.layers.46.self_attn.q_proj",
398
+ "language_model.model.layers.20.self_attn.v_proj",
399
+ "language_model.model.layers.34.self_attn.k_proj",
400
+ "language_model.model.layers.18.mlp.gate_proj",
401
+ "language_model.model.layers.35.self_attn.v_proj",
402
+ "language_model.model.layers.32.mlp.down_proj",
403
+ "language_model.model.layers.6.self_attn.q_proj",
404
+ "language_model.model.layers.20.mlp.down_proj",
405
+ "language_model.model.layers.27.mlp.up_proj",
406
+ "language_model.model.layers.31.self_attn.o_proj",
407
+ "language_model.model.layers.59.mlp.down_proj",
408
+ "language_model.model.layers.4.self_attn.o_proj",
409
+ "language_model.model.layers.15.mlp.gate_proj",
410
+ "language_model.model.layers.44.self_attn.q_proj",
411
+ "language_model.model.layers.31.mlp.up_proj",
412
+ "language_model.model.layers.30.mlp.gate_proj",
413
+ "language_model.model.layers.42.mlp.gate_proj",
414
+ "language_model.model.layers.19.mlp.gate_proj",
415
+ "language_model.model.layers.38.mlp.down_proj",
416
+ "language_model.model.layers.23.self_attn.o_proj",
417
+ "language_model.model.layers.16.mlp.up_proj",
418
+ "language_model.model.layers.52.self_attn.v_proj",
419
+ "language_model.model.layers.22.mlp.up_proj",
420
+ "language_model.model.layers.41.self_attn.o_proj",
421
+ "language_model.model.layers.19.mlp.down_proj",
422
+ "language_model.model.layers.37.mlp.down_proj",
423
+ "language_model.model.layers.8.self_attn.o_proj",
424
+ "language_model.model.layers.57.self_attn.k_proj",
425
+ "language_model.model.layers.41.self_attn.v_proj",
426
+ "language_model.model.layers.21.self_attn.v_proj",
427
+ "language_model.model.layers.34.self_attn.q_proj",
428
+ "language_model.model.layers.47.self_attn.v_proj",
429
+ "language_model.model.layers.32.mlp.gate_proj",
430
+ "language_model.model.layers.43.self_attn.o_proj",
431
+ "language_model.model.layers.21.self_attn.k_proj",
432
+ "language_model.model.layers.1.self_attn.v_proj",
433
+ "language_model.model.layers.8.mlp.up_proj",
434
+ "language_model.model.layers.20.mlp.up_proj",
435
+ "language_model.model.layers.23.mlp.gate_proj",
436
+ "language_model.model.layers.43.mlp.gate_proj",
437
+ "language_model.model.layers.54.self_attn.v_proj",
438
+ "language_model.model.layers.54.mlp.up_proj",
439
+ "language_model.model.layers.50.mlp.up_proj",
440
+ "language_model.model.layers.48.mlp.gate_proj",
441
+ "language_model.model.layers.35.mlp.gate_proj",
442
+ "language_model.model.layers.0.self_attn.k_proj",
443
+ "language_model.model.layers.47.self_attn.o_proj",
444
+ "language_model.model.layers.51.self_attn.k_proj",
445
+ "language_model.model.layers.58.self_attn.k_proj",
446
+ "language_model.model.layers.35.self_attn.o_proj",
447
+ "language_model.model.layers.55.self_attn.q_proj",
448
+ "language_model.model.layers.57.mlp.gate_proj",
449
+ "language_model.model.layers.6.mlp.up_proj",
450
+ "language_model.model.layers.7.mlp.up_proj",
451
+ "language_model.model.layers.34.self_attn.v_proj",
452
+ "language_model.model.layers.48.self_attn.o_proj",
453
+ "language_model.model.layers.36.self_attn.q_proj",
454
+ "language_model.model.layers.6.self_attn.v_proj",
455
+ "language_model.model.layers.53.mlp.down_proj",
456
+ "language_model.model.layers.49.mlp.gate_proj"
457
+ ],
458
+ "task_type": "CAUSAL_LM",
459
+ "use_dora": false,
460
+ "use_rslora": false
461
+ }
checkpoint-100/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c86fba727c9adcec1269557790dda34620d02c7f4ab2af869cec45262164240e
3
+ size 1816396048
checkpoint-100/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-100/chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"
3
+ }
checkpoint-100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b080671f0e69b9a84f544b65077293d663bf58eb0fa6a5d53ad9578e97d7cad6
3
+ size 3633264318
checkpoint-100/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-100/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69f50a692634404f2eebb2eab9f456865957578d752987bc52d843ac2a774366
3
+ size 14244
checkpoint-100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5c461e660f4a5bcff7eb4fae4e538307f4a5201ca1dd3030274daac7a6afbc8
3
+ size 1064
checkpoint-100/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<end_of_turn>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "boi_token": "<start_of_image>",
12
+ "bos_token": {
13
+ "content": "<bos>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eoi_token": "<end_of_image>",
20
+ "eos_token": {
21
+ "content": "<eos>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "image_token": "<image_soft_token>",
28
+ "pad_token": {
29
+ "content": "<pad>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<unk>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-100/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
checkpoint-100/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-100/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-100/trainer_state.json ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 6.666666666666667,
6
+ "eval_steps": 500,
7
+ "global_step": 100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.3333333333333333,
14
+ "grad_norm": 79.55762481689453,
15
+ "learning_rate": 4.9786121534345265e-05,
16
+ "loss": 45.253,
17
+ "num_input_tokens_seen": 47512,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.6666666666666666,
22
+ "grad_norm": 17.002260208129883,
23
+ "learning_rate": 4.914814565722671e-05,
24
+ "loss": 3.0254,
25
+ "num_input_tokens_seen": 94536,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 1.0,
30
+ "grad_norm": 35.45703887939453,
31
+ "learning_rate": 4.8096988312782174e-05,
32
+ "loss": 3.1619,
33
+ "num_input_tokens_seen": 141336,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 1.3333333333333333,
38
+ "grad_norm": 32.79413604736328,
39
+ "learning_rate": 4.665063509461097e-05,
40
+ "loss": 1.7871,
41
+ "num_input_tokens_seen": 188368,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 1.6666666666666665,
46
+ "grad_norm": 57.80039978027344,
47
+ "learning_rate": 4.4833833507280884e-05,
48
+ "loss": 2.643,
49
+ "num_input_tokens_seen": 235128,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 2.0,
54
+ "grad_norm": 8.35395622253418,
55
+ "learning_rate": 4.267766952966369e-05,
56
+ "loss": 2.3801,
57
+ "num_input_tokens_seen": 282672,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 2.3333333333333335,
62
+ "grad_norm": 13.160042762756348,
63
+ "learning_rate": 4.021903572521802e-05,
64
+ "loss": 1.6999,
65
+ "num_input_tokens_seen": 329864,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 2.6666666666666665,
70
+ "grad_norm": 31.193811416625977,
71
+ "learning_rate": 3.7500000000000003e-05,
72
+ "loss": 1.5571,
73
+ "num_input_tokens_seen": 377952,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 3.0,
78
+ "grad_norm": 57.113609313964844,
79
+ "learning_rate": 3.456708580912725e-05,
80
+ "loss": 1.7335,
81
+ "num_input_tokens_seen": 424008,
82
+ "step": 45
83
+ },
84
+ {
85
+ "epoch": 3.3333333333333335,
86
+ "grad_norm": 26.844955444335938,
87
+ "learning_rate": 3.147047612756302e-05,
88
+ "loss": 1.1859,
89
+ "num_input_tokens_seen": 471344,
90
+ "step": 50
91
+ },
92
+ {
93
+ "epoch": 3.6666666666666665,
94
+ "grad_norm": 19.28535270690918,
95
+ "learning_rate": 2.8263154805501297e-05,
96
+ "loss": 0.9846,
97
+ "num_input_tokens_seen": 517832,
98
+ "step": 55
99
+ },
100
+ {
101
+ "epoch": 4.0,
102
+ "grad_norm": 16.303863525390625,
103
+ "learning_rate": 2.5e-05,
104
+ "loss": 0.901,
105
+ "num_input_tokens_seen": 565344,
106
+ "step": 60
107
+ },
108
+ {
109
+ "epoch": 4.333333333333333,
110
+ "grad_norm": 38.277183532714844,
111
+ "learning_rate": 2.173684519449872e-05,
112
+ "loss": 0.4142,
113
+ "num_input_tokens_seen": 612336,
114
+ "step": 65
115
+ },
116
+ {
117
+ "epoch": 4.666666666666667,
118
+ "grad_norm": 16.51125144958496,
119
+ "learning_rate": 1.852952387243698e-05,
120
+ "loss": 0.6221,
121
+ "num_input_tokens_seen": 659560,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 5.0,
126
+ "grad_norm": 55.58554458618164,
127
+ "learning_rate": 1.5432914190872757e-05,
128
+ "loss": 0.7115,
129
+ "num_input_tokens_seen": 706680,
130
+ "step": 75
131
+ },
132
+ {
133
+ "epoch": 5.333333333333333,
134
+ "grad_norm": 7.757297992706299,
135
+ "learning_rate": 1.2500000000000006e-05,
136
+ "loss": 0.1582,
137
+ "num_input_tokens_seen": 754048,
138
+ "step": 80
139
+ },
140
+ {
141
+ "epoch": 5.666666666666667,
142
+ "grad_norm": 6.108384609222412,
143
+ "learning_rate": 9.780964274781984e-06,
144
+ "loss": 0.1891,
145
+ "num_input_tokens_seen": 800984,
146
+ "step": 85
147
+ },
148
+ {
149
+ "epoch": 6.0,
150
+ "grad_norm": 0.9440665245056152,
151
+ "learning_rate": 7.3223304703363135e-06,
152
+ "loss": 0.2214,
153
+ "num_input_tokens_seen": 848016,
154
+ "step": 90
155
+ },
156
+ {
157
+ "epoch": 6.333333333333333,
158
+ "grad_norm": 0.9317387342453003,
159
+ "learning_rate": 5.166166492719124e-06,
160
+ "loss": 0.0135,
161
+ "num_input_tokens_seen": 895040,
162
+ "step": 95
163
+ },
164
+ {
165
+ "epoch": 6.666666666666667,
166
+ "grad_norm": 0.1581590473651886,
167
+ "learning_rate": 3.3493649053890326e-06,
168
+ "loss": 0.0055,
169
+ "num_input_tokens_seen": 941960,
170
+ "step": 100
171
+ }
172
+ ],
173
+ "logging_steps": 5,
174
+ "max_steps": 120,
175
+ "num_input_tokens_seen": 941960,
176
+ "num_train_epochs": 8,
177
+ "save_steps": 100,
178
+ "stateful_callbacks": {
179
+ "TrainerControl": {
180
+ "args": {
181
+ "should_epoch_stop": false,
182
+ "should_evaluate": false,
183
+ "should_log": false,
184
+ "should_save": true,
185
+ "should_training_stop": false
186
+ },
187
+ "attributes": {}
188
+ }
189
+ },
190
+ "total_flos": 1.4961409162866816e+17,
191
+ "train_batch_size": 1,
192
+ "trial_name": null,
193
+ "trial_params": null
194
+ }
checkpoint-100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef6c21cfca5f2537c2c77045da591ab94a70a2c3c6f8193b51ea235f03d35bf
3
+ size 5688
checkpoint-120/README.md ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: google/gemma-3-27b-it
3
+ library_name: peft
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.12.0
checkpoint-120/adapter_config.json ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "google/gemma-3-27b-it",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 128,
14
+ "lora_dropout": 0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 64,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "language_model.model.layers.18.mlp.up_proj",
24
+ "language_model.model.layers.14.self_attn.q_proj",
25
+ "language_model.model.layers.23.mlp.down_proj",
26
+ "language_model.model.layers.5.self_attn.q_proj",
27
+ "language_model.model.layers.48.self_attn.k_proj",
28
+ "language_model.model.layers.29.self_attn.v_proj",
29
+ "language_model.model.layers.39.self_attn.v_proj",
30
+ "language_model.model.layers.5.self_attn.v_proj",
31
+ "language_model.model.layers.60.mlp.gate_proj",
32
+ "language_model.model.layers.17.self_attn.o_proj",
33
+ "language_model.model.layers.60.mlp.down_proj",
34
+ "language_model.model.layers.33.mlp.up_proj",
35
+ "language_model.model.layers.53.self_attn.q_proj",
36
+ "language_model.model.layers.27.mlp.down_proj",
37
+ "language_model.model.layers.57.self_attn.v_proj",
38
+ "language_model.model.layers.26.mlp.up_proj",
39
+ "language_model.model.layers.10.mlp.gate_proj",
40
+ "language_model.model.layers.23.mlp.up_proj",
41
+ "language_model.model.layers.57.self_attn.q_proj",
42
+ "language_model.model.layers.57.mlp.up_proj",
43
+ "language_model.model.layers.39.mlp.gate_proj",
44
+ "language_model.model.layers.26.mlp.down_proj",
45
+ "language_model.model.layers.11.mlp.up_proj",
46
+ "language_model.model.layers.25.mlp.up_proj",
47
+ "language_model.model.layers.6.mlp.gate_proj",
48
+ "language_model.model.layers.36.mlp.gate_proj",
49
+ "language_model.model.layers.2.self_attn.o_proj",
50
+ "language_model.model.layers.30.self_attn.o_proj",
51
+ "language_model.model.layers.4.mlp.gate_proj",
52
+ "language_model.model.layers.13.mlp.up_proj",
53
+ "language_model.model.layers.52.mlp.down_proj",
54
+ "language_model.model.layers.4.mlp.up_proj",
55
+ "language_model.model.layers.32.self_attn.v_proj",
56
+ "language_model.model.layers.1.self_attn.o_proj",
57
+ "language_model.model.layers.22.mlp.down_proj",
58
+ "language_model.model.layers.45.mlp.down_proj",
59
+ "language_model.model.layers.46.self_attn.o_proj",
60
+ "language_model.model.layers.38.self_attn.o_proj",
61
+ "language_model.model.layers.23.self_attn.q_proj",
62
+ "language_model.model.layers.46.mlp.up_proj",
63
+ "language_model.model.layers.49.mlp.down_proj",
64
+ "language_model.model.layers.40.self_attn.k_proj",
65
+ "language_model.model.layers.24.self_attn.q_proj",
66
+ "language_model.model.layers.9.self_attn.o_proj",
67
+ "language_model.model.layers.37.mlp.gate_proj",
68
+ "language_model.model.layers.36.self_attn.k_proj",
69
+ "language_model.model.layers.49.self_attn.k_proj",
70
+ "language_model.model.layers.24.self_attn.v_proj",
71
+ "language_model.model.layers.1.mlp.down_proj",
72
+ "language_model.model.layers.10.mlp.down_proj",
73
+ "language_model.model.layers.61.mlp.gate_proj",
74
+ "language_model.model.layers.34.self_attn.o_proj",
75
+ "language_model.model.layers.9.mlp.up_proj",
76
+ "language_model.model.layers.59.self_attn.v_proj",
77
+ "language_model.model.layers.30.self_attn.q_proj",
78
+ "language_model.model.layers.44.mlp.gate_proj",
79
+ "language_model.model.layers.46.mlp.down_proj",
80
+ "language_model.model.layers.46.self_attn.k_proj",
81
+ "language_model.model.layers.28.self_attn.v_proj",
82
+ "language_model.model.layers.58.self_attn.o_proj",
83
+ "language_model.model.layers.13.self_attn.o_proj",
84
+ "language_model.model.layers.24.mlp.up_proj",
85
+ "language_model.model.layers.0.self_attn.q_proj",
86
+ "language_model.model.layers.58.mlp.up_proj",
87
+ "language_model.model.layers.12.mlp.down_proj",
88
+ "language_model.model.layers.43.mlp.up_proj",
89
+ "language_model.model.layers.44.self_attn.k_proj",
90
+ "language_model.model.layers.17.self_attn.q_proj",
91
+ "language_model.model.layers.59.mlp.up_proj",
92
+ "language_model.model.layers.50.mlp.down_proj",
93
+ "language_model.model.layers.24.self_attn.k_proj",
94
+ "language_model.model.layers.59.self_attn.k_proj",
95
+ "language_model.model.layers.1.mlp.up_proj",
96
+ "language_model.model.layers.58.self_attn.q_proj",
97
+ "language_model.model.layers.51.mlp.gate_proj",
98
+ "language_model.model.layers.4.self_attn.v_proj",
99
+ "language_model.model.layers.45.mlp.gate_proj",
100
+ "language_model.model.layers.8.mlp.gate_proj",
101
+ "language_model.model.layers.43.self_attn.v_proj",
102
+ "language_model.model.layers.51.self_attn.v_proj",
103
+ "language_model.model.layers.37.self_attn.v_proj",
104
+ "language_model.model.layers.51.self_attn.q_proj",
105
+ "language_model.model.layers.51.self_attn.o_proj",
106
+ "language_model.model.layers.33.mlp.down_proj",
107
+ "language_model.model.layers.44.self_attn.v_proj",
108
+ "language_model.model.layers.18.self_attn.o_proj",
109
+ "language_model.model.layers.14.self_attn.k_proj",
110
+ "language_model.model.layers.55.self_attn.k_proj",
111
+ "language_model.model.layers.10.self_attn.q_proj",
112
+ "language_model.model.layers.34.mlp.up_proj",
113
+ "language_model.model.layers.27.self_attn.v_proj",
114
+ "language_model.model.layers.20.self_attn.o_proj",
115
+ "language_model.model.layers.33.self_attn.k_proj",
116
+ "language_model.model.layers.44.mlp.up_proj",
117
+ "language_model.model.layers.3.self_attn.o_proj",
118
+ "language_model.model.layers.15.mlp.up_proj",
119
+ "language_model.model.layers.17.mlp.gate_proj",
120
+ "language_model.model.layers.17.mlp.up_proj",
121
+ "language_model.model.layers.55.mlp.up_proj",
122
+ "language_model.model.layers.0.self_attn.v_proj",
123
+ "language_model.model.layers.56.self_attn.v_proj",
124
+ "language_model.model.layers.39.self_attn.k_proj",
125
+ "language_model.model.layers.31.self_attn.k_proj",
126
+ "language_model.model.layers.50.self_attn.v_proj",
127
+ "language_model.model.layers.8.self_attn.q_proj",
128
+ "language_model.model.layers.29.self_attn.k_proj",
129
+ "language_model.model.layers.28.self_attn.q_proj",
130
+ "language_model.model.layers.37.self_attn.k_proj",
131
+ "language_model.model.layers.8.self_attn.k_proj",
132
+ "language_model.model.layers.60.self_attn.q_proj",
133
+ "language_model.model.layers.13.self_attn.v_proj",
134
+ "language_model.model.layers.18.self_attn.v_proj",
135
+ "language_model.model.layers.35.mlp.up_proj",
136
+ "language_model.model.layers.56.self_attn.k_proj",
137
+ "language_model.model.layers.27.self_attn.o_proj",
138
+ "language_model.model.layers.40.self_attn.q_proj",
139
+ "language_model.model.layers.25.mlp.down_proj",
140
+ "language_model.model.layers.50.self_attn.k_proj",
141
+ "language_model.model.layers.39.mlp.up_proj",
142
+ "language_model.model.layers.13.mlp.down_proj",
143
+ "language_model.model.layers.39.self_attn.o_proj",
144
+ "language_model.model.layers.56.mlp.down_proj",
145
+ "language_model.model.layers.13.self_attn.k_proj",
146
+ "language_model.model.layers.10.self_attn.k_proj",
147
+ "language_model.model.layers.42.self_attn.v_proj",
148
+ "language_model.model.layers.3.mlp.down_proj",
149
+ "language_model.model.layers.9.self_attn.q_proj",
150
+ "language_model.model.layers.36.mlp.up_proj",
151
+ "language_model.model.layers.29.mlp.down_proj",
152
+ "language_model.model.layers.51.mlp.down_proj",
153
+ "language_model.model.layers.36.self_attn.o_proj",
154
+ "language_model.model.layers.7.mlp.gate_proj",
155
+ "language_model.model.layers.31.mlp.gate_proj",
156
+ "language_model.model.layers.38.mlp.gate_proj",
157
+ "language_model.model.layers.55.mlp.down_proj",
158
+ "language_model.model.layers.30.mlp.down_proj",
159
+ "language_model.model.layers.54.mlp.gate_proj",
160
+ "language_model.model.layers.42.self_attn.q_proj",
161
+ "language_model.model.layers.0.mlp.down_proj",
162
+ "language_model.model.layers.32.self_attn.o_proj",
163
+ "language_model.model.layers.61.self_attn.q_proj",
164
+ "language_model.model.layers.55.self_attn.o_proj",
165
+ "language_model.model.layers.27.self_attn.q_proj",
166
+ "language_model.model.layers.41.mlp.up_proj",
167
+ "language_model.model.layers.2.mlp.down_proj",
168
+ "language_model.model.layers.48.self_attn.v_proj",
169
+ "language_model.model.layers.55.mlp.gate_proj",
170
+ "language_model.model.layers.5.self_attn.o_proj",
171
+ "language_model.model.layers.53.mlp.gate_proj",
172
+ "language_model.model.layers.26.mlp.gate_proj",
173
+ "language_model.model.layers.56.mlp.gate_proj",
174
+ "language_model.model.layers.53.mlp.up_proj",
175
+ "language_model.model.layers.16.self_attn.k_proj",
176
+ "language_model.model.layers.43.mlp.down_proj",
177
+ "language_model.model.layers.1.self_attn.k_proj",
178
+ "language_model.model.layers.19.self_attn.q_proj",
179
+ "language_model.model.layers.3.self_attn.k_proj",
180
+ "language_model.model.layers.21.self_attn.q_proj",
181
+ "language_model.model.layers.15.self_attn.o_proj",
182
+ "language_model.model.layers.57.self_attn.o_proj",
183
+ "language_model.model.layers.49.self_attn.o_proj",
184
+ "language_model.model.layers.50.self_attn.q_proj",
185
+ "language_model.model.layers.58.mlp.down_proj",
186
+ "language_model.model.layers.26.self_attn.k_proj",
187
+ "language_model.model.layers.38.self_attn.v_proj",
188
+ "language_model.model.layers.19.self_attn.v_proj",
189
+ "language_model.model.layers.19.mlp.up_proj",
190
+ "language_model.model.layers.3.self_attn.q_proj",
191
+ "language_model.model.layers.7.mlp.down_proj",
192
+ "language_model.model.layers.9.self_attn.k_proj",
193
+ "language_model.model.layers.29.mlp.up_proj",
194
+ "language_model.model.layers.49.self_attn.q_proj",
195
+ "language_model.model.layers.13.self_attn.q_proj",
196
+ "language_model.model.layers.59.self_attn.o_proj",
197
+ "language_model.model.layers.3.mlp.up_proj",
198
+ "language_model.model.layers.8.self_attn.v_proj",
199
+ "language_model.model.layers.0.self_attn.o_proj",
200
+ "language_model.model.layers.2.mlp.gate_proj",
201
+ "language_model.model.layers.16.self_attn.v_proj",
202
+ "language_model.model.layers.10.self_attn.v_proj",
203
+ "language_model.model.layers.16.mlp.down_proj",
204
+ "language_model.model.layers.20.mlp.gate_proj",
205
+ "language_model.model.layers.55.self_attn.v_proj",
206
+ "language_model.model.layers.49.self_attn.v_proj",
207
+ "language_model.model.layers.17.mlp.down_proj",
208
+ "language_model.model.layers.18.mlp.down_proj",
209
+ "language_model.model.layers.57.mlp.down_proj",
210
+ "language_model.model.layers.40.mlp.down_proj",
211
+ "language_model.model.layers.27.mlp.gate_proj",
212
+ "language_model.model.layers.17.self_attn.k_proj",
213
+ "language_model.model.layers.40.self_attn.o_proj",
214
+ "language_model.model.layers.12.mlp.gate_proj",
215
+ "language_model.model.layers.18.self_attn.q_proj",
216
+ "language_model.model.layers.54.self_attn.q_proj",
217
+ "language_model.model.layers.37.self_attn.o_proj",
218
+ "language_model.model.layers.20.self_attn.q_proj",
219
+ "language_model.model.layers.31.mlp.down_proj",
220
+ "language_model.model.layers.10.mlp.up_proj",
221
+ "language_model.model.layers.7.self_attn.k_proj",
222
+ "language_model.model.layers.1.self_attn.q_proj",
223
+ "language_model.model.layers.5.mlp.gate_proj",
224
+ "language_model.model.layers.61.mlp.down_proj",
225
+ "language_model.model.layers.46.self_attn.v_proj",
226
+ "language_model.model.layers.12.self_attn.v_proj",
227
+ "language_model.model.layers.54.self_attn.o_proj",
228
+ "language_model.model.layers.29.self_attn.o_proj",
229
+ "language_model.model.layers.61.self_attn.k_proj",
230
+ "language_model.model.layers.61.mlp.up_proj",
231
+ "language_model.model.layers.12.self_attn.o_proj",
232
+ "language_model.model.layers.5.mlp.up_proj",
233
+ "language_model.model.layers.54.mlp.down_proj",
234
+ "language_model.model.layers.53.self_attn.v_proj",
235
+ "language_model.model.layers.38.self_attn.k_proj",
236
+ "language_model.model.layers.42.self_attn.o_proj",
237
+ "language_model.model.layers.47.mlp.gate_proj",
238
+ "language_model.model.layers.25.mlp.gate_proj",
239
+ "language_model.model.layers.38.self_attn.q_proj",
240
+ "language_model.model.layers.11.self_attn.k_proj",
241
+ "language_model.model.layers.3.self_attn.v_proj",
242
+ "language_model.model.layers.61.self_attn.v_proj",
243
+ "language_model.model.layers.23.self_attn.k_proj",
244
+ "language_model.model.layers.16.self_attn.o_proj",
245
+ "language_model.model.layers.25.self_attn.q_proj",
246
+ "language_model.model.layers.37.mlp.up_proj",
247
+ "language_model.model.layers.13.mlp.gate_proj",
248
+ "language_model.model.layers.24.self_attn.o_proj",
249
+ "language_model.model.layers.35.self_attn.q_proj",
250
+ "language_model.model.layers.59.self_attn.q_proj",
251
+ "language_model.model.layers.17.self_attn.v_proj",
252
+ "language_model.model.layers.15.mlp.down_proj",
253
+ "language_model.model.layers.48.self_attn.q_proj",
254
+ "language_model.model.layers.61.self_attn.o_proj",
255
+ "language_model.model.layers.30.self_attn.k_proj",
256
+ "language_model.model.layers.21.mlp.up_proj",
257
+ "language_model.model.layers.44.mlp.down_proj",
258
+ "language_model.model.layers.12.self_attn.k_proj",
259
+ "language_model.model.layers.31.self_attn.q_proj",
260
+ "language_model.model.layers.31.self_attn.v_proj",
261
+ "language_model.model.layers.1.mlp.gate_proj",
262
+ "language_model.model.layers.22.self_attn.o_proj",
263
+ "language_model.model.layers.47.mlp.down_proj",
264
+ "language_model.model.layers.4.self_attn.k_proj",
265
+ "language_model.model.layers.25.self_attn.k_proj",
266
+ "language_model.model.layers.41.self_attn.k_proj",
267
+ "language_model.model.layers.33.self_attn.v_proj",
268
+ "language_model.model.layers.26.self_attn.q_proj",
269
+ "language_model.model.layers.9.mlp.down_proj",
270
+ "language_model.model.layers.45.self_attn.k_proj",
271
+ "language_model.model.layers.38.mlp.up_proj",
272
+ "language_model.model.layers.0.mlp.up_proj",
273
+ "language_model.model.layers.59.mlp.gate_proj",
274
+ "language_model.model.layers.5.self_attn.k_proj",
275
+ "language_model.model.layers.10.self_attn.o_proj",
276
+ "language_model.model.layers.60.mlp.up_proj",
277
+ "language_model.model.layers.26.self_attn.v_proj",
278
+ "language_model.model.layers.40.mlp.gate_proj",
279
+ "language_model.model.layers.60.self_attn.o_proj",
280
+ "language_model.model.layers.0.mlp.gate_proj",
281
+ "language_model.model.layers.39.mlp.down_proj",
282
+ "language_model.model.layers.28.self_attn.k_proj",
283
+ "language_model.model.layers.19.self_attn.o_proj",
284
+ "language_model.model.layers.43.self_attn.k_proj",
285
+ "language_model.model.layers.11.self_attn.q_proj",
286
+ "language_model.model.layers.41.mlp.gate_proj",
287
+ "language_model.model.layers.35.mlp.down_proj",
288
+ "language_model.model.layers.52.self_attn.o_proj",
289
+ "language_model.model.layers.32.self_attn.q_proj",
290
+ "language_model.model.layers.30.mlp.up_proj",
291
+ "language_model.model.layers.47.self_attn.q_proj",
292
+ "language_model.model.layers.21.mlp.down_proj",
293
+ "language_model.model.layers.24.mlp.gate_proj",
294
+ "language_model.model.layers.53.self_attn.o_proj",
295
+ "language_model.model.layers.42.mlp.down_proj",
296
+ "language_model.model.layers.44.self_attn.o_proj",
297
+ "language_model.model.layers.24.mlp.down_proj",
298
+ "language_model.model.layers.29.self_attn.q_proj",
299
+ "language_model.model.layers.49.mlp.up_proj",
300
+ "language_model.model.layers.4.self_attn.q_proj",
301
+ "language_model.model.layers.56.mlp.up_proj",
302
+ "language_model.model.layers.8.mlp.down_proj",
303
+ "language_model.model.layers.25.self_attn.v_proj",
304
+ "language_model.model.layers.37.self_attn.q_proj",
305
+ "language_model.model.layers.6.self_attn.k_proj",
306
+ "language_model.model.layers.14.self_attn.o_proj",
307
+ "language_model.model.layers.60.self_attn.k_proj",
308
+ "language_model.model.layers.2.mlp.up_proj",
309
+ "language_model.model.layers.34.mlp.gate_proj",
310
+ "language_model.model.layers.52.mlp.gate_proj",
311
+ "language_model.model.layers.6.mlp.down_proj",
312
+ "language_model.model.layers.45.self_attn.q_proj",
313
+ "language_model.model.layers.41.self_attn.q_proj",
314
+ "language_model.model.layers.52.self_attn.k_proj",
315
+ "language_model.model.layers.36.self_attn.v_proj",
316
+ "language_model.model.layers.28.mlp.down_proj",
317
+ "language_model.model.layers.15.self_attn.v_proj",
318
+ "language_model.model.layers.11.self_attn.o_proj",
319
+ "language_model.model.layers.29.mlp.gate_proj",
320
+ "language_model.model.layers.42.self_attn.k_proj",
321
+ "language_model.model.layers.52.mlp.up_proj",
322
+ "language_model.model.layers.22.self_attn.k_proj",
323
+ "language_model.model.layers.14.mlp.down_proj",
324
+ "language_model.model.layers.4.mlp.down_proj",
325
+ "language_model.model.layers.35.self_attn.k_proj",
326
+ "language_model.model.layers.52.self_attn.q_proj",
327
+ "language_model.model.layers.22.self_attn.q_proj",
328
+ "language_model.model.layers.58.mlp.gate_proj",
329
+ "language_model.model.layers.14.mlp.gate_proj",
330
+ "language_model.model.layers.47.self_attn.k_proj",
331
+ "language_model.model.layers.39.self_attn.q_proj",
332
+ "language_model.model.layers.42.mlp.up_proj",
333
+ "language_model.model.layers.34.mlp.down_proj",
334
+ "language_model.model.layers.30.self_attn.v_proj",
335
+ "language_model.model.layers.56.self_attn.o_proj",
336
+ "language_model.model.layers.25.self_attn.o_proj",
337
+ "language_model.model.layers.45.mlp.up_proj",
338
+ "language_model.model.layers.48.mlp.down_proj",
339
+ "language_model.model.layers.7.self_attn.o_proj",
340
+ "language_model.model.layers.18.self_attn.k_proj",
341
+ "language_model.model.layers.14.self_attn.v_proj",
342
+ "language_model.model.layers.40.self_attn.v_proj",
343
+ "language_model.model.layers.22.self_attn.v_proj",
344
+ "language_model.model.layers.7.self_attn.q_proj",
345
+ "language_model.model.layers.46.mlp.gate_proj",
346
+ "language_model.model.layers.56.self_attn.q_proj",
347
+ "language_model.model.layers.28.mlp.up_proj",
348
+ "language_model.model.layers.50.mlp.gate_proj",
349
+ "language_model.model.layers.23.self_attn.v_proj",
350
+ "language_model.model.layers.15.self_attn.q_proj",
351
+ "language_model.model.layers.9.mlp.gate_proj",
352
+ "language_model.model.layers.47.mlp.up_proj",
353
+ "language_model.model.layers.6.self_attn.o_proj",
354
+ "language_model.model.layers.12.self_attn.q_proj",
355
+ "language_model.model.layers.20.self_attn.k_proj",
356
+ "language_model.model.layers.51.mlp.up_proj",
357
+ "language_model.model.layers.58.self_attn.v_proj",
358
+ "language_model.model.layers.22.mlp.gate_proj",
359
+ "language_model.model.layers.14.mlp.up_proj",
360
+ "language_model.model.layers.33.self_attn.o_proj",
361
+ "language_model.model.layers.11.mlp.down_proj",
362
+ "language_model.model.layers.16.mlp.gate_proj",
363
+ "language_model.model.layers.40.mlp.up_proj",
364
+ "language_model.model.layers.60.self_attn.v_proj",
365
+ "language_model.model.layers.2.self_attn.k_proj",
366
+ "language_model.model.layers.26.self_attn.o_proj",
367
+ "language_model.model.layers.12.mlp.up_proj",
368
+ "language_model.model.layers.28.self_attn.o_proj",
369
+ "language_model.model.layers.32.mlp.up_proj",
370
+ "language_model.model.layers.45.self_attn.o_proj",
371
+ "language_model.model.layers.28.mlp.gate_proj",
372
+ "language_model.model.layers.11.mlp.gate_proj",
373
+ "language_model.model.layers.11.self_attn.v_proj",
374
+ "language_model.model.layers.2.self_attn.v_proj",
375
+ "language_model.model.layers.9.self_attn.v_proj",
376
+ "language_model.model.layers.19.self_attn.k_proj",
377
+ "language_model.model.layers.32.self_attn.k_proj",
378
+ "language_model.model.layers.43.self_attn.q_proj",
379
+ "language_model.model.layers.21.mlp.gate_proj",
380
+ "language_model.model.layers.45.self_attn.v_proj",
381
+ "language_model.model.layers.41.mlp.down_proj",
382
+ "language_model.model.layers.36.mlp.down_proj",
383
+ "language_model.model.layers.53.self_attn.k_proj",
384
+ "language_model.model.layers.16.self_attn.q_proj",
385
+ "language_model.model.layers.3.mlp.gate_proj",
386
+ "language_model.model.layers.15.self_attn.k_proj",
387
+ "language_model.model.layers.33.mlp.gate_proj",
388
+ "language_model.model.layers.48.mlp.up_proj",
389
+ "language_model.model.layers.33.self_attn.q_proj",
390
+ "language_model.model.layers.54.self_attn.k_proj",
391
+ "language_model.model.layers.7.self_attn.v_proj",
392
+ "language_model.model.layers.5.mlp.down_proj",
393
+ "language_model.model.layers.50.self_attn.o_proj",
394
+ "language_model.model.layers.21.self_attn.o_proj",
395
+ "language_model.model.layers.2.self_attn.q_proj",
396
+ "language_model.model.layers.27.self_attn.k_proj",
397
+ "language_model.model.layers.46.self_attn.q_proj",
398
+ "language_model.model.layers.20.self_attn.v_proj",
399
+ "language_model.model.layers.34.self_attn.k_proj",
400
+ "language_model.model.layers.18.mlp.gate_proj",
401
+ "language_model.model.layers.35.self_attn.v_proj",
402
+ "language_model.model.layers.32.mlp.down_proj",
403
+ "language_model.model.layers.6.self_attn.q_proj",
404
+ "language_model.model.layers.20.mlp.down_proj",
405
+ "language_model.model.layers.27.mlp.up_proj",
406
+ "language_model.model.layers.31.self_attn.o_proj",
407
+ "language_model.model.layers.59.mlp.down_proj",
408
+ "language_model.model.layers.4.self_attn.o_proj",
409
+ "language_model.model.layers.15.mlp.gate_proj",
410
+ "language_model.model.layers.44.self_attn.q_proj",
411
+ "language_model.model.layers.31.mlp.up_proj",
412
+ "language_model.model.layers.30.mlp.gate_proj",
413
+ "language_model.model.layers.42.mlp.gate_proj",
414
+ "language_model.model.layers.19.mlp.gate_proj",
415
+ "language_model.model.layers.38.mlp.down_proj",
416
+ "language_model.model.layers.23.self_attn.o_proj",
417
+ "language_model.model.layers.16.mlp.up_proj",
418
+ "language_model.model.layers.52.self_attn.v_proj",
419
+ "language_model.model.layers.22.mlp.up_proj",
420
+ "language_model.model.layers.41.self_attn.o_proj",
421
+ "language_model.model.layers.19.mlp.down_proj",
422
+ "language_model.model.layers.37.mlp.down_proj",
423
+ "language_model.model.layers.8.self_attn.o_proj",
424
+ "language_model.model.layers.57.self_attn.k_proj",
425
+ "language_model.model.layers.41.self_attn.v_proj",
426
+ "language_model.model.layers.21.self_attn.v_proj",
427
+ "language_model.model.layers.34.self_attn.q_proj",
428
+ "language_model.model.layers.47.self_attn.v_proj",
429
+ "language_model.model.layers.32.mlp.gate_proj",
430
+ "language_model.model.layers.43.self_attn.o_proj",
431
+ "language_model.model.layers.21.self_attn.k_proj",
432
+ "language_model.model.layers.1.self_attn.v_proj",
433
+ "language_model.model.layers.8.mlp.up_proj",
434
+ "language_model.model.layers.20.mlp.up_proj",
435
+ "language_model.model.layers.23.mlp.gate_proj",
436
+ "language_model.model.layers.43.mlp.gate_proj",
437
+ "language_model.model.layers.54.self_attn.v_proj",
438
+ "language_model.model.layers.54.mlp.up_proj",
439
+ "language_model.model.layers.50.mlp.up_proj",
440
+ "language_model.model.layers.48.mlp.gate_proj",
441
+ "language_model.model.layers.35.mlp.gate_proj",
442
+ "language_model.model.layers.0.self_attn.k_proj",
443
+ "language_model.model.layers.47.self_attn.o_proj",
444
+ "language_model.model.layers.51.self_attn.k_proj",
445
+ "language_model.model.layers.58.self_attn.k_proj",
446
+ "language_model.model.layers.35.self_attn.o_proj",
447
+ "language_model.model.layers.55.self_attn.q_proj",
448
+ "language_model.model.layers.57.mlp.gate_proj",
449
+ "language_model.model.layers.6.mlp.up_proj",
450
+ "language_model.model.layers.7.mlp.up_proj",
451
+ "language_model.model.layers.34.self_attn.v_proj",
452
+ "language_model.model.layers.48.self_attn.o_proj",
453
+ "language_model.model.layers.36.self_attn.q_proj",
454
+ "language_model.model.layers.6.self_attn.v_proj",
455
+ "language_model.model.layers.53.mlp.down_proj",
456
+ "language_model.model.layers.49.mlp.gate_proj"
457
+ ],
458
+ "task_type": "CAUSAL_LM",
459
+ "use_dora": false,
460
+ "use_rslora": false
461
+ }
checkpoint-120/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2858bf7ea1a8dfcfd2e9c7c507b264f099f9dbe08796a8fc21788c499f6d5ad8
3
+ size 1816396048
checkpoint-120/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
checkpoint-120/chat_template.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{{ bos_token }}\n{%- if messages[0]['role'] == 'system' -%}\n {%- if messages[0]['content'] is string -%}\n {%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}\n {%- else -%}\n {%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}\n {%- endif -%}\n {%- set loop_messages = messages[1:] -%}\n{%- else -%}\n {%- set first_user_prefix = \"\" -%}\n {%- set loop_messages = messages -%}\n{%- endif -%}\n{%- for message in loop_messages -%}\n {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}\n {{ raise_exception(\"Conversation roles must alternate user/assistant/user/assistant/...\") }}\n {%- endif -%}\n {%- if (message['role'] == 'assistant') -%}\n {%- set role = \"model\" -%}\n {%- else -%}\n {%- set role = message['role'] -%}\n {%- endif -%}\n {{ '<start_of_turn>' + role + '\n' + (first_user_prefix if loop.first else \"\") }}\n {%- if message['content'] is string -%}\n {{ message['content'] | trim }}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{ '<start_of_image>' }}\n {%- elif item['type'] == 'text' -%}\n {{ item['text'] | trim }}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{ raise_exception(\"Invalid content type\") }}\n {%- endif -%}\n {{ '<end_of_turn>\n' }}\n{%- endfor -%}\n{%- if add_generation_prompt -%}\n {{'<start_of_turn>model\n'}}\n{%- endif -%}\n"
3
+ }
checkpoint-120/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc88d42378a7140b8aef7d0b8bde0e94b26a5479e6b49d2a1c648aeb84053478
3
+ size 3633264318
checkpoint-120/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
checkpoint-120/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
checkpoint-120/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9899ccda7f0d8d9511991180b93aab508ce6e8489de708c88ad1188e7e1d90d6
3
+ size 14244
checkpoint-120/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b0e3be07f2ddf2c035815fdad5db539ba623c588ed1a2904332a7a2674ae64a
3
+ size 1064
checkpoint-120/special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<end_of_turn>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "boi_token": "<start_of_image>",
12
+ "bos_token": {
13
+ "content": "<bos>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eoi_token": "<end_of_image>",
20
+ "eos_token": {
21
+ "content": "<eos>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "image_token": "<image_soft_token>",
28
+ "pad_token": {
29
+ "content": "<pad>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<unk>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
checkpoint-120/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
checkpoint-120/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
checkpoint-120/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoint-120/trainer_state.json ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 8.0,
6
+ "eval_steps": 500,
7
+ "global_step": 120,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.3333333333333333,
14
+ "grad_norm": 79.55762481689453,
15
+ "learning_rate": 4.9786121534345265e-05,
16
+ "loss": 45.253,
17
+ "num_input_tokens_seen": 47512,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.6666666666666666,
22
+ "grad_norm": 17.002260208129883,
23
+ "learning_rate": 4.914814565722671e-05,
24
+ "loss": 3.0254,
25
+ "num_input_tokens_seen": 94536,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 1.0,
30
+ "grad_norm": 35.45703887939453,
31
+ "learning_rate": 4.8096988312782174e-05,
32
+ "loss": 3.1619,
33
+ "num_input_tokens_seen": 141336,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 1.3333333333333333,
38
+ "grad_norm": 32.79413604736328,
39
+ "learning_rate": 4.665063509461097e-05,
40
+ "loss": 1.7871,
41
+ "num_input_tokens_seen": 188368,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 1.6666666666666665,
46
+ "grad_norm": 57.80039978027344,
47
+ "learning_rate": 4.4833833507280884e-05,
48
+ "loss": 2.643,
49
+ "num_input_tokens_seen": 235128,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 2.0,
54
+ "grad_norm": 8.35395622253418,
55
+ "learning_rate": 4.267766952966369e-05,
56
+ "loss": 2.3801,
57
+ "num_input_tokens_seen": 282672,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 2.3333333333333335,
62
+ "grad_norm": 13.160042762756348,
63
+ "learning_rate": 4.021903572521802e-05,
64
+ "loss": 1.6999,
65
+ "num_input_tokens_seen": 329864,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 2.6666666666666665,
70
+ "grad_norm": 31.193811416625977,
71
+ "learning_rate": 3.7500000000000003e-05,
72
+ "loss": 1.5571,
73
+ "num_input_tokens_seen": 377952,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 3.0,
78
+ "grad_norm": 57.113609313964844,
79
+ "learning_rate": 3.456708580912725e-05,
80
+ "loss": 1.7335,
81
+ "num_input_tokens_seen": 424008,
82
+ "step": 45
83
+ },
84
+ {
85
+ "epoch": 3.3333333333333335,
86
+ "grad_norm": 26.844955444335938,
87
+ "learning_rate": 3.147047612756302e-05,
88
+ "loss": 1.1859,
89
+ "num_input_tokens_seen": 471344,
90
+ "step": 50
91
+ },
92
+ {
93
+ "epoch": 3.6666666666666665,
94
+ "grad_norm": 19.28535270690918,
95
+ "learning_rate": 2.8263154805501297e-05,
96
+ "loss": 0.9846,
97
+ "num_input_tokens_seen": 517832,
98
+ "step": 55
99
+ },
100
+ {
101
+ "epoch": 4.0,
102
+ "grad_norm": 16.303863525390625,
103
+ "learning_rate": 2.5e-05,
104
+ "loss": 0.901,
105
+ "num_input_tokens_seen": 565344,
106
+ "step": 60
107
+ },
108
+ {
109
+ "epoch": 4.333333333333333,
110
+ "grad_norm": 38.277183532714844,
111
+ "learning_rate": 2.173684519449872e-05,
112
+ "loss": 0.4142,
113
+ "num_input_tokens_seen": 612336,
114
+ "step": 65
115
+ },
116
+ {
117
+ "epoch": 4.666666666666667,
118
+ "grad_norm": 16.51125144958496,
119
+ "learning_rate": 1.852952387243698e-05,
120
+ "loss": 0.6221,
121
+ "num_input_tokens_seen": 659560,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 5.0,
126
+ "grad_norm": 55.58554458618164,
127
+ "learning_rate": 1.5432914190872757e-05,
128
+ "loss": 0.7115,
129
+ "num_input_tokens_seen": 706680,
130
+ "step": 75
131
+ },
132
+ {
133
+ "epoch": 5.333333333333333,
134
+ "grad_norm": 7.757297992706299,
135
+ "learning_rate": 1.2500000000000006e-05,
136
+ "loss": 0.1582,
137
+ "num_input_tokens_seen": 754048,
138
+ "step": 80
139
+ },
140
+ {
141
+ "epoch": 5.666666666666667,
142
+ "grad_norm": 6.108384609222412,
143
+ "learning_rate": 9.780964274781984e-06,
144
+ "loss": 0.1891,
145
+ "num_input_tokens_seen": 800984,
146
+ "step": 85
147
+ },
148
+ {
149
+ "epoch": 6.0,
150
+ "grad_norm": 0.9440665245056152,
151
+ "learning_rate": 7.3223304703363135e-06,
152
+ "loss": 0.2214,
153
+ "num_input_tokens_seen": 848016,
154
+ "step": 90
155
+ },
156
+ {
157
+ "epoch": 6.333333333333333,
158
+ "grad_norm": 0.9317387342453003,
159
+ "learning_rate": 5.166166492719124e-06,
160
+ "loss": 0.0135,
161
+ "num_input_tokens_seen": 895040,
162
+ "step": 95
163
+ },
164
+ {
165
+ "epoch": 6.666666666666667,
166
+ "grad_norm": 0.1581590473651886,
167
+ "learning_rate": 3.3493649053890326e-06,
168
+ "loss": 0.0055,
169
+ "num_input_tokens_seen": 941960,
170
+ "step": 100
171
+ },
172
+ {
173
+ "epoch": 7.0,
174
+ "grad_norm": 1.00675368309021,
175
+ "learning_rate": 1.9030116872178316e-06,
176
+ "loss": 0.031,
177
+ "num_input_tokens_seen": 989352,
178
+ "step": 105
179
+ },
180
+ {
181
+ "epoch": 7.333333333333333,
182
+ "grad_norm": 0.6310182213783264,
183
+ "learning_rate": 8.51854342773295e-07,
184
+ "loss": 0.0085,
185
+ "num_input_tokens_seen": 1035760,
186
+ "step": 110
187
+ },
188
+ {
189
+ "epoch": 7.666666666666667,
190
+ "grad_norm": 0.24195235967636108,
191
+ "learning_rate": 2.1387846565474045e-07,
192
+ "loss": 0.0032,
193
+ "num_input_tokens_seen": 1083624,
194
+ "step": 115
195
+ },
196
+ {
197
+ "epoch": 8.0,
198
+ "grad_norm": 0.020462460815906525,
199
+ "learning_rate": 0.0,
200
+ "loss": 0.0005,
201
+ "num_input_tokens_seen": 1130688,
202
+ "step": 120
203
+ }
204
+ ],
205
+ "logging_steps": 5,
206
+ "max_steps": 120,
207
+ "num_input_tokens_seen": 1130688,
208
+ "num_train_epochs": 8,
209
+ "save_steps": 100,
210
+ "stateful_callbacks": {
211
+ "TrainerControl": {
212
+ "args": {
213
+ "should_epoch_stop": false,
214
+ "should_evaluate": false,
215
+ "should_log": false,
216
+ "should_save": true,
217
+ "should_training_stop": true
218
+ },
219
+ "attributes": {}
220
+ }
221
+ },
222
+ "total_flos": 1.7959027775641805e+17,
223
+ "train_batch_size": 1,
224
+ "trial_name": null,
225
+ "trial_params": null
226
+ }
checkpoint-120/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ef6c21cfca5f2537c2c77045da591ab94a70a2c3c6f8193b51ea235f03d35bf
3
+ size 5688
llamaboard_config.yaml ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ top.booster: auto
2
+ top.checkpoint_path: []
3
+ top.finetuning_type: lora
4
+ top.model_name: Gemma-3-27B-Instruct
5
+ top.quantization_bit: none
6
+ top.quantization_method: bitsandbytes
7
+ top.rope_scaling: none
8
+ top.template: gemma3
9
+ train.additional_target: ''
10
+ train.apollo_rank: 16
11
+ train.apollo_scale: 32
12
+ train.apollo_target: all
13
+ train.apollo_update_interval: 200
14
+ train.badam_mode: layer
15
+ train.badam_switch_interval: 50
16
+ train.badam_switch_mode: ascending
17
+ train.badam_update_ratio: 0.05
18
+ train.batch_size: 1
19
+ train.compute_type: bf16
20
+ train.create_new_adapter: false
21
+ train.cutoff_len: 2048
22
+ train.dataset:
23
+ - ets120
24
+ train.dataset_dir: data
25
+ train.ds_offload: false
26
+ train.ds_stage: none
27
+ train.extra_args: '{"optim": "adamw_torch"}'
28
+ train.freeze_extra_modules: ''
29
+ train.freeze_trainable_layers: 2
30
+ train.freeze_trainable_modules: all
31
+ train.galore_rank: 16
32
+ train.galore_scale: 2
33
+ train.galore_target: all
34
+ train.galore_update_interval: 200
35
+ train.gradient_accumulation_steps: 8
36
+ train.learning_rate: 5e-5
37
+ train.logging_steps: 5
38
+ train.lora_alpha: 128
39
+ train.lora_dropout: 0
40
+ train.lora_rank: 64
41
+ train.lora_target: ''
42
+ train.loraplus_lr_ratio: 0
43
+ train.lr_scheduler_type: cosine
44
+ train.mask_history: false
45
+ train.max_grad_norm: '1.0'
46
+ train.max_samples: '100000'
47
+ train.neat_packing: false
48
+ train.neftune_alpha: 0
49
+ train.num_train_epochs: '8'
50
+ train.packing: false
51
+ train.ppo_score_norm: false
52
+ train.ppo_whiten_rewards: false
53
+ train.pref_beta: 0.1
54
+ train.pref_ftx: 0
55
+ train.pref_loss: sigmoid
56
+ train.report_to:
57
+ - none
58
+ train.resize_vocab: false
59
+ train.reward_model: []
60
+ train.save_steps: 100
61
+ train.swanlab_api_key: ''
62
+ train.swanlab_link: ''
63
+ train.swanlab_mode: cloud
64
+ train.swanlab_project: llamafactory
65
+ train.swanlab_run_name: ''
66
+ train.swanlab_workspace: ''
67
+ train.train_on_prompt: false
68
+ train.training_stage: Supervised Fine-Tuning
69
+ train.use_apollo: false
70
+ train.use_badam: false
71
+ train.use_dora: false
72
+ train.use_galore: false
73
+ train.use_llama_pro: false
74
+ train.use_pissa: false
75
+ train.use_rslora: false
76
+ train.use_swanlab: false
77
+ train.val_size: 0
78
+ train.warmup_steps: 0
preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
running_log.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<end_of_turn>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "boi_token": "<start_of_image>",
12
+ "bos_token": {
13
+ "content": "<bos>",
14
+ "lstrip": false,
15
+ "normalized": false,
16
+ "rstrip": false,
17
+ "single_word": false
18
+ },
19
+ "eoi_token": "<end_of_image>",
20
+ "eos_token": {
21
+ "content": "<eos>",
22
+ "lstrip": false,
23
+ "normalized": false,
24
+ "rstrip": false,
25
+ "single_word": false
26
+ },
27
+ "image_token": "<image_soft_token>",
28
+ "pad_token": {
29
+ "content": "<pad>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "unk_token": {
36
+ "content": "<unk>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ }
42
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
train_results.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 8.0,
3
+ "num_input_tokens_seen": 1130688,
4
+ "total_flos": 1.7959027775641805e+17,
5
+ "train_loss": 2.86213872662629,
6
+ "train_runtime": 778.6794,
7
+ "train_samples_per_second": 1.233,
8
+ "train_steps_per_second": 0.154
9
+ }
trainer_log.jsonl ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"current_steps": 5, "total_steps": 120, "loss": 45.253, "lr": 4.9786121534345265e-05, "epoch": 0.3333333333333333, "percentage": 4.17, "elapsed_time": "0:00:32", "remaining_time": "0:12:33", "throughput": 1449.55, "total_tokens": 47512}
2
+ {"current_steps": 10, "total_steps": 120, "loss": 3.0254, "lr": 4.914814565722671e-05, "epoch": 0.6666666666666666, "percentage": 8.33, "elapsed_time": "0:01:04", "remaining_time": "0:11:50", "throughput": 1463.37, "total_tokens": 94536}
3
+ {"current_steps": 15, "total_steps": 120, "loss": 3.1619, "lr": 4.8096988312782174e-05, "epoch": 1.0, "percentage": 12.5, "elapsed_time": "0:01:36", "remaining_time": "0:11:13", "throughput": 1468.19, "total_tokens": 141336}
4
+ {"current_steps": 20, "total_steps": 120, "loss": 1.7871, "lr": 4.665063509461097e-05, "epoch": 1.3333333333333333, "percentage": 16.67, "elapsed_time": "0:02:08", "remaining_time": "0:10:41", "throughput": 1469.09, "total_tokens": 188368}
5
+ {"current_steps": 25, "total_steps": 120, "loss": 2.643, "lr": 4.4833833507280884e-05, "epoch": 1.6666666666666665, "percentage": 20.83, "elapsed_time": "0:02:39", "remaining_time": "0:10:07", "throughput": 1471.42, "total_tokens": 235128}
6
+ {"current_steps": 30, "total_steps": 120, "loss": 2.3801, "lr": 4.267766952966369e-05, "epoch": 2.0, "percentage": 25.0, "elapsed_time": "0:03:11", "remaining_time": "0:09:35", "throughput": 1472.69, "total_tokens": 282672}
7
+ {"current_steps": 35, "total_steps": 120, "loss": 1.6999, "lr": 4.021903572521802e-05, "epoch": 2.3333333333333335, "percentage": 29.17, "elapsed_time": "0:03:43", "remaining_time": "0:09:03", "throughput": 1473.48, "total_tokens": 329864}
8
+ {"current_steps": 40, "total_steps": 120, "loss": 1.5571, "lr": 3.7500000000000003e-05, "epoch": 2.6666666666666665, "percentage": 33.33, "elapsed_time": "0:04:16", "remaining_time": "0:08:32", "throughput": 1474.26, "total_tokens": 377952}
9
+ {"current_steps": 45, "total_steps": 120, "loss": 1.7335, "lr": 3.456708580912725e-05, "epoch": 3.0, "percentage": 37.5, "elapsed_time": "0:04:47", "remaining_time": "0:07:59", "throughput": 1474.37, "total_tokens": 424008}
10
+ {"current_steps": 50, "total_steps": 120, "loss": 1.1859, "lr": 3.147047612756302e-05, "epoch": 3.3333333333333335, "percentage": 41.67, "elapsed_time": "0:05:19", "remaining_time": "0:07:27", "throughput": 1474.56, "total_tokens": 471344}
11
+ {"current_steps": 55, "total_steps": 120, "loss": 0.9846, "lr": 2.8263154805501297e-05, "epoch": 3.6666666666666665, "percentage": 45.83, "elapsed_time": "0:05:51", "remaining_time": "0:06:54", "throughput": 1474.79, "total_tokens": 517832}
12
+ {"current_steps": 60, "total_steps": 120, "loss": 0.901, "lr": 2.5e-05, "epoch": 4.0, "percentage": 50.0, "elapsed_time": "0:06:23", "remaining_time": "0:06:23", "throughput": 1475.6, "total_tokens": 565344}
13
+ {"current_steps": 65, "total_steps": 120, "loss": 0.4142, "lr": 2.173684519449872e-05, "epoch": 4.333333333333333, "percentage": 54.17, "elapsed_time": "0:06:54", "remaining_time": "0:05:51", "throughput": 1475.83, "total_tokens": 612336}
14
+ {"current_steps": 70, "total_steps": 120, "loss": 0.6221, "lr": 1.852952387243698e-05, "epoch": 4.666666666666667, "percentage": 58.33, "elapsed_time": "0:07:26", "remaining_time": "0:05:19", "throughput": 1476.44, "total_tokens": 659560}
15
+ {"current_steps": 75, "total_steps": 120, "loss": 0.7115, "lr": 1.5432914190872757e-05, "epoch": 5.0, "percentage": 62.5, "elapsed_time": "0:07:58", "remaining_time": "0:04:47", "throughput": 1476.17, "total_tokens": 706680}
16
+ {"current_steps": 80, "total_steps": 120, "loss": 0.1582, "lr": 1.2500000000000006e-05, "epoch": 5.333333333333333, "percentage": 66.67, "elapsed_time": "0:08:30", "remaining_time": "0:04:15", "throughput": 1476.06, "total_tokens": 754048}
17
+ {"current_steps": 85, "total_steps": 120, "loss": 0.1891, "lr": 9.780964274781984e-06, "epoch": 5.666666666666667, "percentage": 70.83, "elapsed_time": "0:09:02", "remaining_time": "0:03:43", "throughput": 1476.33, "total_tokens": 800984}
18
+ {"current_steps": 90, "total_steps": 120, "loss": 0.2214, "lr": 7.3223304703363135e-06, "epoch": 6.0, "percentage": 75.0, "elapsed_time": "0:09:34", "remaining_time": "0:03:11", "throughput": 1476.67, "total_tokens": 848016}
19
+ {"current_steps": 95, "total_steps": 120, "loss": 0.0135, "lr": 5.166166492719124e-06, "epoch": 6.333333333333333, "percentage": 79.17, "elapsed_time": "0:10:06", "remaining_time": "0:02:39", "throughput": 1476.53, "total_tokens": 895040}
20
+ {"current_steps": 100, "total_steps": 120, "loss": 0.0055, "lr": 3.3493649053890326e-06, "epoch": 6.666666666666667, "percentage": 83.33, "elapsed_time": "0:10:37", "remaining_time": "0:02:07", "throughput": 1476.74, "total_tokens": 941960}
21
+ {"current_steps": 105, "total_steps": 120, "loss": 0.031, "lr": 1.9030116872178316e-06, "epoch": 7.0, "percentage": 87.5, "elapsed_time": "0:11:16", "remaining_time": "0:01:36", "throughput": 1462.15, "total_tokens": 989352}
22
+ {"current_steps": 110, "total_steps": 120, "loss": 0.0085, "lr": 8.51854342773295e-07, "epoch": 7.333333333333333, "percentage": 91.67, "elapsed_time": "0:11:48", "remaining_time": "0:01:04", "throughput": 1462.44, "total_tokens": 1035760}
23
+ {"current_steps": 115, "total_steps": 120, "loss": 0.0032, "lr": 2.1387846565474045e-07, "epoch": 7.666666666666667, "percentage": 95.83, "elapsed_time": "0:12:20", "remaining_time": "0:00:32", "throughput": 1463.42, "total_tokens": 1083624}
24
+ {"current_steps": 120, "total_steps": 120, "loss": 0.0005, "lr": 0.0, "epoch": 8.0, "percentage": 100.0, "elapsed_time": "0:12:52", "remaining_time": "0:00:00", "throughput": 1464.1, "total_tokens": 1130688}
25
+ {"current_steps": 120, "total_steps": 120, "epoch": 8.0, "percentage": 100.0, "elapsed_time": "0:12:58", "remaining_time": "0:00:00", "throughput": 1452.07, "total_tokens": 1130688}
trainer_state.json ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 8.0,
6
+ "eval_steps": 500,
7
+ "global_step": 120,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.3333333333333333,
14
+ "grad_norm": 79.55762481689453,
15
+ "learning_rate": 4.9786121534345265e-05,
16
+ "loss": 45.253,
17
+ "num_input_tokens_seen": 47512,
18
+ "step": 5
19
+ },
20
+ {
21
+ "epoch": 0.6666666666666666,
22
+ "grad_norm": 17.002260208129883,
23
+ "learning_rate": 4.914814565722671e-05,
24
+ "loss": 3.0254,
25
+ "num_input_tokens_seen": 94536,
26
+ "step": 10
27
+ },
28
+ {
29
+ "epoch": 1.0,
30
+ "grad_norm": 35.45703887939453,
31
+ "learning_rate": 4.8096988312782174e-05,
32
+ "loss": 3.1619,
33
+ "num_input_tokens_seen": 141336,
34
+ "step": 15
35
+ },
36
+ {
37
+ "epoch": 1.3333333333333333,
38
+ "grad_norm": 32.79413604736328,
39
+ "learning_rate": 4.665063509461097e-05,
40
+ "loss": 1.7871,
41
+ "num_input_tokens_seen": 188368,
42
+ "step": 20
43
+ },
44
+ {
45
+ "epoch": 1.6666666666666665,
46
+ "grad_norm": 57.80039978027344,
47
+ "learning_rate": 4.4833833507280884e-05,
48
+ "loss": 2.643,
49
+ "num_input_tokens_seen": 235128,
50
+ "step": 25
51
+ },
52
+ {
53
+ "epoch": 2.0,
54
+ "grad_norm": 8.35395622253418,
55
+ "learning_rate": 4.267766952966369e-05,
56
+ "loss": 2.3801,
57
+ "num_input_tokens_seen": 282672,
58
+ "step": 30
59
+ },
60
+ {
61
+ "epoch": 2.3333333333333335,
62
+ "grad_norm": 13.160042762756348,
63
+ "learning_rate": 4.021903572521802e-05,
64
+ "loss": 1.6999,
65
+ "num_input_tokens_seen": 329864,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 2.6666666666666665,
70
+ "grad_norm": 31.193811416625977,
71
+ "learning_rate": 3.7500000000000003e-05,
72
+ "loss": 1.5571,
73
+ "num_input_tokens_seen": 377952,
74
+ "step": 40
75
+ },
76
+ {
77
+ "epoch": 3.0,
78
+ "grad_norm": 57.113609313964844,
79
+ "learning_rate": 3.456708580912725e-05,
80
+ "loss": 1.7335,
81
+ "num_input_tokens_seen": 424008,
82
+ "step": 45
83
+ },
84
+ {
85
+ "epoch": 3.3333333333333335,
86
+ "grad_norm": 26.844955444335938,
87
+ "learning_rate": 3.147047612756302e-05,
88
+ "loss": 1.1859,
89
+ "num_input_tokens_seen": 471344,
90
+ "step": 50
91
+ },
92
+ {
93
+ "epoch": 3.6666666666666665,
94
+ "grad_norm": 19.28535270690918,
95
+ "learning_rate": 2.8263154805501297e-05,
96
+ "loss": 0.9846,
97
+ "num_input_tokens_seen": 517832,
98
+ "step": 55
99
+ },
100
+ {
101
+ "epoch": 4.0,
102
+ "grad_norm": 16.303863525390625,
103
+ "learning_rate": 2.5e-05,
104
+ "loss": 0.901,
105
+ "num_input_tokens_seen": 565344,
106
+ "step": 60
107
+ },
108
+ {
109
+ "epoch": 4.333333333333333,
110
+ "grad_norm": 38.277183532714844,
111
+ "learning_rate": 2.173684519449872e-05,
112
+ "loss": 0.4142,
113
+ "num_input_tokens_seen": 612336,
114
+ "step": 65
115
+ },
116
+ {
117
+ "epoch": 4.666666666666667,
118
+ "grad_norm": 16.51125144958496,
119
+ "learning_rate": 1.852952387243698e-05,
120
+ "loss": 0.6221,
121
+ "num_input_tokens_seen": 659560,
122
+ "step": 70
123
+ },
124
+ {
125
+ "epoch": 5.0,
126
+ "grad_norm": 55.58554458618164,
127
+ "learning_rate": 1.5432914190872757e-05,
128
+ "loss": 0.7115,
129
+ "num_input_tokens_seen": 706680,
130
+ "step": 75
131
+ },
132
+ {
133
+ "epoch": 5.333333333333333,
134
+ "grad_norm": 7.757297992706299,
135
+ "learning_rate": 1.2500000000000006e-05,
136
+ "loss": 0.1582,
137
+ "num_input_tokens_seen": 754048,
138
+ "step": 80
139
+ },
140
+ {
141
+ "epoch": 5.666666666666667,
142
+ "grad_norm": 6.108384609222412,
143
+ "learning_rate": 9.780964274781984e-06,
144
+ "loss": 0.1891,
145
+ "num_input_tokens_seen": 800984,
146
+ "step": 85
147
+ },
148
+ {
149
+ "epoch": 6.0,
150
+ "grad_norm": 0.9440665245056152,
151
+ "learning_rate": 7.3223304703363135e-06,
152
+ "loss": 0.2214,
153
+ "num_input_tokens_seen": 848016,
154
+ "step": 90
155
+ },
156
+ {
157
+ "epoch": 6.333333333333333,
158
+ "grad_norm": 0.9317387342453003,
159
+ "learning_rate": 5.166166492719124e-06,
160
+ "loss": 0.0135,
161
+ "num_input_tokens_seen": 895040,
162
+ "step": 95
163
+ },
164
+ {
165
+ "epoch": 6.666666666666667,
166
+ "grad_norm": 0.1581590473651886,
167
+ "learning_rate": 3.3493649053890326e-06,
168
+ "loss": 0.0055,
169
+ "num_input_tokens_seen": 941960,
170
+ "step": 100
171
+ },
172
+ {
173
+ "epoch": 7.0,
174
+ "grad_norm": 1.00675368309021,
175
+ "learning_rate": 1.9030116872178316e-06,
176
+ "loss": 0.031,
177
+ "num_input_tokens_seen": 989352,
178
+ "step": 105
179
+ },
180
+ {
181
+ "epoch": 7.333333333333333,
182
+ "grad_norm": 0.6310182213783264,
183
+ "learning_rate": 8.51854342773295e-07,
184
+ "loss": 0.0085,
185
+ "num_input_tokens_seen": 1035760,
186
+ "step": 110
187
+ },
188
+ {
189
+ "epoch": 7.666666666666667,
190
+ "grad_norm": 0.24195235967636108,
191
+ "learning_rate": 2.1387846565474045e-07,
192
+ "loss": 0.0032,
193
+ "num_input_tokens_seen": 1083624,
194
+ "step": 115
195
+ },
196
+ {
197
+ "epoch": 8.0,
198
+ "grad_norm": 0.020462460815906525,
199
+ "learning_rate": 0.0,
200
+ "loss": 0.0005,
201
+ "num_input_tokens_seen": 1130688,
202
+ "step": 120
203
+ },
204
+ {
205
+ "epoch": 8.0,
206
+ "num_input_tokens_seen": 1130688,
207
+ "step": 120,
208
+ "total_flos": 1.7959027775641805e+17,
209
+ "train_loss": 2.86213872662629,
210
+ "train_runtime": 778.6794,
211
+ "train_samples_per_second": 1.233,
212
+ "train_steps_per_second": 0.154
213
+ }
214
+ ],
215
+ "logging_steps": 5,
216
+ "max_steps": 120,
217
+ "num_input_tokens_seen": 1130688,
218
+ "num_train_epochs": 8,
219
+ "save_steps": 100,
220
+ "stateful_callbacks": {
221
+ "TrainerControl": {
222
+ "args": {
223
+ "should_epoch_stop": false,
224
+ "should_evaluate": false,
225
+ "should_log": false,
226
+ "should_save": true,
227
+ "should_training_stop": true
228
+ },
229
+ "attributes": {}
230
+ }
231
+ },
232
+ "total_flos": 1.7959027775641805e+17,
233
+ "train_batch_size": 1,
234
+ "trial_name": null,
235
+ "trial_params": null
236
+ }