SidhaarthMurali committed
Commit ee57d50 · verified · 1 Parent(s): 4e1c285

Upload adapter model
README.md CHANGED
@@ -7,18 +7,18 @@ tags:
 - lora
 - generated_from_trainer
 model-index:
-- name: sft
+- name: gsm8k
   results: []
 ---
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment. -->
 
-# sft
+# gsm8k
 
 This model is a fine-tuned version of [meta-llama/Llama-3.2-3B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) on the identity and the gsm8k_ig datasets.
 It achieves the following results on the evaluation set:
-- Loss: 0.2039
+- Loss: 0.4402
 
 ## Model description
 
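For reference, a minimal sketch of how an adapter like the one in this commit is typically loaded on top of the base model with `transformers` and `peft`; the adapter repo id and the prompt below are placeholders, not values taken from this repository:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_id = "meta-llama/Llama-3.2-3B-Instruct"
adapter_id = "<user>/<adapter-repo>"  # placeholder for this adapter's repo id

# Load the frozen base model, then attach the LoRA adapter weights.
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id)
model = PeftModel.from_pretrained(model, adapter_id)

prompt = "A train travels 60 km in 1.5 hours. What is its average speed?"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```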
 
adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
     "up_proj",
-    "down_proj",
     "k_proj",
     "q_proj",
-    "o_proj"
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc1629f9904e2c64a7cdbcad0ff4445478569abc535578337ef91a5587eb2fa2
+oid sha256:64cdf48bf20d36aa6b5ce0f5aedefd6dbb67e13e5ab5d9d96720c741d6b5d1cb
 size 48679352
all_results.json CHANGED
@@ -1,12 +1,12 @@
 {
-  "epoch": 2.987755102040816,
-  "eval_loss": 0.20389176905155182,
-  "eval_runtime": 82.0511,
-  "eval_samples_per_second": 1.328,
-  "eval_steps_per_second": 1.328,
-  "total_flos": 9841219894640640.0,
-  "train_loss": 0.14712934611273593,
-  "train_runtime": 6269.6713,
-  "train_samples_per_second": 0.469,
-  "train_steps_per_second": 0.058
+  "epoch": 2.984709480122324,
+  "eval_loss": 0.44016265869140625,
+  "eval_runtime": 67.0579,
+  "eval_samples_per_second": 1.625,
+  "eval_steps_per_second": 1.625,
+  "total_flos": 8710805828812800.0,
+  "train_loss": 0.3467143246384918,
+  "train_runtime": 5692.8921,
+  "train_samples_per_second": 0.517,
+  "train_steps_per_second": 0.064
 }
checkpoint-366/adapter_config.json CHANGED
@@ -20,13 +20,13 @@
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "gate_proj",
-    "v_proj",
     "up_proj",
-    "down_proj",
     "k_proj",
     "q_proj",
-    "o_proj"
+    "o_proj",
+    "v_proj",
+    "gate_proj",
+    "down_proj"
   ],
   "task_type": "CAUSAL_LM",
   "use_dora": false,
checkpoint-366/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fc1629f9904e2c64a7cdbcad0ff4445478569abc535578337ef91a5587eb2fa2
+oid sha256:64cdf48bf20d36aa6b5ce0f5aedefd6dbb67e13e5ab5d9d96720c741d6b5d1cb
 size 48679352
checkpoint-366/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c4f08989dea061d73c0567e1997346f4620cb876fbf3a3c74ccae526132e55b5
+oid sha256:802266ebc610fbe356b9936dd577c92f8c0c85266d8bcb4fded9e1476230c203
 size 97585002
checkpoint-366/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.987755102040816,
+  "epoch": 2.984709480122324,
   "eval_steps": 500,
   "global_step": 366,
   "is_hyper_param_search": false,
@@ -9,255 +9,255 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.08163265306122448,
-      "grad_norm": 0.4708639085292816,
+      "epoch": 0.08154943934760449,
+      "grad_norm": 0.660578191280365,
       "learning_rate": 2.702702702702703e-05,
-      "loss": 0.5385,
+      "loss": 0.7127,
       "step": 10
     },
     {
-      "epoch": 0.16326530612244897,
-      "grad_norm": 0.4191964268684387,
+      "epoch": 0.16309887869520898,
+      "grad_norm": 0.7271870374679565,
       "learning_rate": 5.405405405405406e-05,
-      "loss": 0.4991,
+      "loss": 0.7168,
       "step": 20
     },
     {
-      "epoch": 0.24489795918367346,
-      "grad_norm": 0.40369561314582825,
+      "epoch": 0.24464831804281345,
+      "grad_norm": 0.7715198993682861,
       "learning_rate": 8.108108108108109e-05,
-      "loss": 0.3058,
+      "loss": 0.5257,
       "step": 30
     },
     {
-      "epoch": 0.32653061224489793,
-      "grad_norm": 0.6080173850059509,
+      "epoch": 0.32619775739041795,
+      "grad_norm": 0.5615158081054688,
       "learning_rate": 9.997948550797227e-05,
-      "loss": 0.223,
+      "loss": 0.4019,
       "step": 40
     },
     {
-      "epoch": 0.40816326530612246,
-      "grad_norm": 0.3516406714916229,
+      "epoch": 0.4077471967380224,
+      "grad_norm": 0.4358239769935608,
       "learning_rate": 9.961525153583327e-05,
-      "loss": 0.1966,
+      "loss": 0.3371,
       "step": 50
     },
     {
-      "epoch": 0.4897959183673469,
-      "grad_norm": 0.3611922264099121,
+      "epoch": 0.4892966360856269,
+      "grad_norm": 0.5554934740066528,
       "learning_rate": 9.879896064123961e-05,
-      "loss": 0.1635,
+      "loss": 0.4258,
       "step": 60
     },
     {
-      "epoch": 0.5714285714285714,
-      "grad_norm": 0.3673703372478485,
+      "epoch": 0.5708460754332314,
+      "grad_norm": 0.5321815609931946,
       "learning_rate": 9.753805025397779e-05,
-      "loss": 0.1547,
+      "loss": 0.375,
       "step": 70
     },
     {
-      "epoch": 0.6530612244897959,
-      "grad_norm": 0.31258246302604675,
+      "epoch": 0.6523955147808359,
+      "grad_norm": 0.42357194423675537,
       "learning_rate": 9.584400884284545e-05,
-      "loss": 0.141,
+      "loss": 0.368,
       "step": 80
     },
     {
-      "epoch": 0.7346938775510204,
-      "grad_norm": 0.40288954973220825,
+      "epoch": 0.7339449541284404,
+      "grad_norm": 0.502548098564148,
       "learning_rate": 9.373227124134888e-05,
-      "loss": 0.1343,
+      "loss": 0.365,
       "step": 90
     },
     {
-      "epoch": 0.8163265306122449,
-      "grad_norm": 0.36042073369026184,
+      "epoch": 0.8154943934760448,
+      "grad_norm": 1.1985193490982056,
       "learning_rate": 9.122207801708802e-05,
-      "loss": 0.1552,
+      "loss": 0.4522,
       "step": 100
     },
     {
-      "epoch": 0.8979591836734694,
-      "grad_norm": 0.33795708417892456,
+      "epoch": 0.8970438328236493,
+      "grad_norm": 0.5025231242179871,
       "learning_rate": 8.833630016614976e-05,
-      "loss": 0.1341,
+      "loss": 0.3663,
       "step": 110
     },
     {
-      "epoch": 0.9795918367346939,
-      "grad_norm": 0.2802504003047943,
+      "epoch": 0.9785932721712538,
+      "grad_norm": 0.3899594247341156,
       "learning_rate": 8.510123072976239e-05,
-      "loss": 0.1361,
+      "loss": 0.3597,
       "step": 120
     },
     {
-      "epoch": 1.0612244897959184,
-      "grad_norm": 0.3562999367713928,
+      "epoch": 1.0601427115188584,
+      "grad_norm": 0.4278734028339386,
       "learning_rate": 8.154634523184388e-05,
-      "loss": 0.1554,
+      "loss": 0.3343,
       "step": 130
     },
     {
-      "epoch": 1.1428571428571428,
-      "grad_norm": 0.3551533818244934,
+      "epoch": 1.1416921508664628,
+      "grad_norm": 0.5638087391853333,
       "learning_rate": 7.770403312015721e-05,
-      "loss": 0.1222,
+      "loss": 0.3215,
       "step": 140
     },
     {
-      "epoch": 1.2244897959183674,
-      "grad_norm": 0.3744591474533081,
+      "epoch": 1.2232415902140672,
+      "grad_norm": 0.5326379537582397,
       "learning_rate": 7.360930265797935e-05,
-      "loss": 0.1268,
+      "loss": 0.3058,
       "step": 150
     },
     {
-      "epoch": 1.306122448979592,
-      "grad_norm": 0.3726373016834259,
+      "epoch": 1.3047910295616718,
+      "grad_norm": 0.5562155842781067,
       "learning_rate": 6.929946195508932e-05,
-      "loss": 0.0923,
+      "loss": 0.3273,
       "step": 160
     },
     {
-      "epoch": 1.3877551020408163,
-      "grad_norm": 0.45406657457351685,
+      "epoch": 1.3863404689092762,
+      "grad_norm": 0.6649357080459595,
       "learning_rate": 6.481377904428171e-05,
-      "loss": 0.1236,
+      "loss": 0.3461,
       "step": 170
     },
     {
-      "epoch": 1.469387755102041,
-      "grad_norm": 0.26440876722335815,
+      "epoch": 1.4678899082568808,
+      "grad_norm": 0.5380542278289795,
       "learning_rate": 6.019312410053286e-05,
-      "loss": 0.1059,
+      "loss": 0.3111,
       "step": 180
     },
     {
-      "epoch": 1.5510204081632653,
-      "grad_norm": 0.3584797978401184,
+      "epoch": 1.5494393476044852,
+      "grad_norm": 0.6011327505111694,
       "learning_rate": 5.547959706265068e-05,
-      "loss": 0.13,
+      "loss": 0.287,
       "step": 190
     },
     {
-      "epoch": 1.6326530612244898,
-      "grad_norm": 0.2686370611190796,
+      "epoch": 1.6309887869520896,
+      "grad_norm": 0.5643779635429382,
       "learning_rate": 5.0716144050239375e-05,
-      "loss": 0.1132,
+      "loss": 0.3202,
       "step": 200
     },
     {
-      "epoch": 1.7142857142857144,
-      "grad_norm": 0.29573601484298706,
+      "epoch": 1.7125382262996942,
+      "grad_norm": 0.5761425495147705,
       "learning_rate": 4.594616607090028e-05,
-      "loss": 0.1093,
+      "loss": 0.3272,
       "step": 210
     },
     {
-      "epoch": 1.7959183673469388,
-      "grad_norm": 0.33462539315223694,
+      "epoch": 1.7940876656472988,
+      "grad_norm": 0.7082852125167847,
       "learning_rate": 4.121312358283463e-05,
-      "loss": 0.1014,
+      "loss": 0.3277,
       "step": 220
     },
     {
-      "epoch": 1.8775510204081631,
-      "grad_norm": 0.40894749760627747,
+      "epoch": 1.8756371049949032,
+      "grad_norm": 0.41143250465393066,
       "learning_rate": 3.656014051577713e-05,
-      "loss": 0.1038,
+      "loss": 0.3057,
       "step": 230
     },
     {
-      "epoch": 1.9591836734693877,
-      "grad_norm": 0.30836549401283264,
+      "epoch": 1.9571865443425076,
+      "grad_norm": 0.408113569021225,
       "learning_rate": 3.202961135812437e-05,
-      "loss": 0.1088,
+      "loss": 0.3117,
       "step": 240
     },
     {
-      "epoch": 2.0408163265306123,
-      "grad_norm": 0.3176974654197693,
+      "epoch": 2.038735983690112,
+      "grad_norm": 0.47071775794029236,
       "learning_rate": 2.7662814890184818e-05,
-      "loss": 0.1055,
+      "loss": 0.3175,
       "step": 250
     },
     {
-      "epoch": 2.122448979591837,
-      "grad_norm": 0.2630854845046997,
+      "epoch": 2.120285423037717,
+      "grad_norm": 0.592940092086792,
       "learning_rate": 2.3499538082923606e-05,
-      "loss": 0.0942,
+      "loss": 0.2854,
       "step": 260
     },
     {
-      "epoch": 2.204081632653061,
-      "grad_norm": 0.3280630111694336,
+      "epoch": 2.2018348623853212,
+      "grad_norm": 0.569543182849884,
       "learning_rate": 1.9577713588953795e-05,
-      "loss": 0.0954,
+      "loss": 0.254,
       "step": 270
     },
     {
-      "epoch": 2.2857142857142856,
-      "grad_norm": 0.2787545621395111,
+      "epoch": 2.2833843017329256,
+      "grad_norm": 0.6796756982803345,
       "learning_rate": 1.5933074128684332e-05,
-      "loss": 0.0855,
+      "loss": 0.2637,
       "step": 280
     },
     {
-      "epoch": 2.36734693877551,
-      "grad_norm": 0.3742433190345764,
+      "epoch": 2.36493374108053,
+      "grad_norm": 0.5621957182884216,
       "learning_rate": 1.2598826920598772e-05,
-      "loss": 0.1161,
+      "loss": 0.2521,
       "step": 290
     },
     {
-      "epoch": 2.4489795918367347,
-      "grad_norm": 0.2906797528266907,
+      "epoch": 2.4464831804281344,
+      "grad_norm": 0.708060085773468,
       "learning_rate": 9.605351122011309e-06,
-      "loss": 0.0914,
+      "loss": 0.2497,
       "step": 300
     },
     {
-      "epoch": 2.5306122448979593,
-      "grad_norm": 0.3414059579372406,
+      "epoch": 2.528032619775739,
+      "grad_norm": 0.583659827709198,
       "learning_rate": 6.979921036993042e-06,
-      "loss": 0.0971,
+      "loss": 0.2822,
       "step": 310
     },
     {
-      "epoch": 2.612244897959184,
-      "grad_norm": 0.2982410490512848,
+      "epoch": 2.6095820591233436,
+      "grad_norm": 0.6250892281532288,
       "learning_rate": 4.746457613389904e-06,
-      "loss": 0.0855,
+      "loss": 0.2675,
       "step": 320
     },
     {
-      "epoch": 2.693877551020408,
-      "grad_norm": 0.3430004417896271,
+      "epoch": 2.691131498470948,
+      "grad_norm": 0.6775749325752258,
       "learning_rate": 2.925310493105099e-06,
-      "loss": 0.0928,
+      "loss": 0.2996,
       "step": 330
     },
     {
-      "epoch": 2.7755102040816326,
-      "grad_norm": 0.4764673709869385,
+      "epoch": 2.7726809378185524,
+      "grad_norm": 0.8949369192123413,
       "learning_rate": 1.5330726014397668e-06,
-      "loss": 0.1081,
+      "loss": 0.3064,
       "step": 340
     },
     {
-      "epoch": 2.857142857142857,
-      "grad_norm": 0.3845561742782593,
+      "epoch": 2.8542303771661572,
+      "grad_norm": 0.6756715178489685,
       "learning_rate": 5.824289648152126e-07,
-      "loss": 0.0895,
+      "loss": 0.2577,
       "step": 350
     },
     {
-      "epoch": 2.938775510204082,
-      "grad_norm": 0.3539472818374634,
+      "epoch": 2.9357798165137616,
+      "grad_norm": 0.4671032130718231,
       "learning_rate": 8.204113433559201e-08,
-      "loss": 0.0974,
+      "loss": 0.2614,
       "step": 360
     }
   ],
@@ -278,7 +278,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 9841219894640640.0,
+  "total_flos": 8710805828812800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
checkpoint-366/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d483a705da1c1d63c473066abd5241d98a6dac495fb8004c57333467fd5d881
+oid sha256:18db36ef5929943174913d64146ca832cb44d0b91ef63a7ab3fe26d3ed18fb1c
 size 5432
eval_results.json CHANGED
@@ -1,7 +1,7 @@
 {
-  "epoch": 2.987755102040816,
-  "eval_loss": 0.20389176905155182,
-  "eval_runtime": 82.0511,
-  "eval_samples_per_second": 1.328,
-  "eval_steps_per_second": 1.328
+  "epoch": 2.984709480122324,
+  "eval_loss": 0.44016265869140625,
+  "eval_runtime": 67.0579,
+  "eval_samples_per_second": 1.625,
+  "eval_steps_per_second": 1.625
 }
runs/Dec30_16-32-59_542f969342da/events.out.tfevents.1735576597.542f969342da.2955.0 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ff7a103f373be5a52518a66dce19d6f2b20275bc27141b799c752129655e6c
+size 13442
runs/Dec30_16-32-59_542f969342da/events.out.tfevents.1735582358.542f969342da.2955.1 ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aaa7716a637d4d0c9b44698fd057b8e1573cdb12da4fc2cce711c69b75eba638
+size 359
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
-  "epoch": 2.987755102040816,
-  "total_flos": 9841219894640640.0,
-  "train_loss": 0.14712934611273593,
-  "train_runtime": 6269.6713,
-  "train_samples_per_second": 0.469,
-  "train_steps_per_second": 0.058
+  "epoch": 2.984709480122324,
+  "total_flos": 8710805828812800.0,
+  "train_loss": 0.3467143246384918,
+  "train_runtime": 5692.8921,
+  "train_samples_per_second": 0.517,
+  "train_steps_per_second": 0.064
 }
trainer_log.jsonl CHANGED
@@ -1,37 +1,37 @@
-{"current_steps": 10, "total_steps": 366, "loss": 0.5385, "lr": 2.702702702702703e-05, "epoch": 0.08163265306122448, "percentage": 2.73, "elapsed_time": "0:02:51", "remaining_time": "1:41:33"}
-{"current_steps": 20, "total_steps": 366, "loss": 0.4991, "lr": 5.405405405405406e-05, "epoch": 0.16326530612244897, "percentage": 5.46, "elapsed_time": "0:05:37", "remaining_time": "1:37:23"}
-{"current_steps": 30, "total_steps": 366, "loss": 0.3058, "lr": 8.108108108108109e-05, "epoch": 0.24489795918367346, "percentage": 8.2, "elapsed_time": "0:08:26", "remaining_time": "1:34:28"}
-{"current_steps": 40, "total_steps": 366, "loss": 0.223, "lr": 9.997948550797227e-05, "epoch": 0.32653061224489793, "percentage": 10.93, "elapsed_time": "0:11:20", "remaining_time": "1:32:26"}
-{"current_steps": 50, "total_steps": 366, "loss": 0.1966, "lr": 9.961525153583327e-05, "epoch": 0.40816326530612246, "percentage": 13.66, "elapsed_time": "0:14:08", "remaining_time": "1:29:22"}
-{"current_steps": 60, "total_steps": 366, "loss": 0.1635, "lr": 9.879896064123961e-05, "epoch": 0.4897959183673469, "percentage": 16.39, "elapsed_time": "0:16:51", "remaining_time": "1:26:00"}
-{"current_steps": 70, "total_steps": 366, "loss": 0.1547, "lr": 9.753805025397779e-05, "epoch": 0.5714285714285714, "percentage": 19.13, "elapsed_time": "0:19:36", "remaining_time": "1:22:55"}
-{"current_steps": 80, "total_steps": 366, "loss": 0.141, "lr": 9.584400884284545e-05, "epoch": 0.6530612244897959, "percentage": 21.86, "elapsed_time": "0:22:20", "remaining_time": "1:19:52"}
-{"current_steps": 90, "total_steps": 366, "loss": 0.1343, "lr": 9.373227124134888e-05, "epoch": 0.7346938775510204, "percentage": 24.59, "elapsed_time": "0:25:11", "remaining_time": "1:17:16"}
-{"current_steps": 100, "total_steps": 366, "loss": 0.1552, "lr": 9.122207801708802e-05, "epoch": 0.8163265306122449, "percentage": 27.32, "elapsed_time": "0:27:59", "remaining_time": "1:14:27"}
-{"current_steps": 110, "total_steps": 366, "loss": 0.1341, "lr": 8.833630016614976e-05, "epoch": 0.8979591836734694, "percentage": 30.05, "elapsed_time": "0:30:55", "remaining_time": "1:11:59"}
-{"current_steps": 120, "total_steps": 366, "loss": 0.1361, "lr": 8.510123072976239e-05, "epoch": 0.9795918367346939, "percentage": 32.79, "elapsed_time": "0:33:47", "remaining_time": "1:09:16"}
-{"current_steps": 130, "total_steps": 366, "loss": 0.1554, "lr": 8.154634523184388e-05, "epoch": 1.0612244897959184, "percentage": 35.52, "elapsed_time": "0:36:34", "remaining_time": "1:06:23"}
-{"current_steps": 140, "total_steps": 366, "loss": 0.1222, "lr": 7.770403312015721e-05, "epoch": 1.1428571428571428, "percentage": 38.25, "elapsed_time": "0:39:27", "remaining_time": "1:03:41"}
-{"current_steps": 150, "total_steps": 366, "loss": 0.1268, "lr": 7.360930265797935e-05, "epoch": 1.2244897959183674, "percentage": 40.98, "elapsed_time": "0:42:18", "remaining_time": "1:00:55"}
-{"current_steps": 160, "total_steps": 366, "loss": 0.0923, "lr": 6.929946195508932e-05, "epoch": 1.306122448979592, "percentage": 43.72, "elapsed_time": "0:45:11", "remaining_time": "0:58:11"}
-{"current_steps": 170, "total_steps": 366, "loss": 0.1236, "lr": 6.481377904428171e-05, "epoch": 1.3877551020408163, "percentage": 46.45, "elapsed_time": "0:47:54", "remaining_time": "0:55:13"}
-{"current_steps": 180, "total_steps": 366, "loss": 0.1059, "lr": 6.019312410053286e-05, "epoch": 1.469387755102041, "percentage": 49.18, "elapsed_time": "0:50:43", "remaining_time": "0:52:24"}
-{"current_steps": 190, "total_steps": 366, "loss": 0.13, "lr": 5.547959706265068e-05, "epoch": 1.5510204081632653, "percentage": 51.91, "elapsed_time": "0:53:31", "remaining_time": "0:49:34"}
-{"current_steps": 200, "total_steps": 366, "loss": 0.1132, "lr": 5.0716144050239375e-05, "epoch": 1.6326530612244898, "percentage": 54.64, "elapsed_time": "0:56:29", "remaining_time": "0:46:53"}
-{"current_steps": 210, "total_steps": 366, "loss": 0.1093, "lr": 4.594616607090028e-05, "epoch": 1.7142857142857144, "percentage": 57.38, "elapsed_time": "0:59:10", "remaining_time": "0:43:57"}
-{"current_steps": 220, "total_steps": 366, "loss": 0.1014, "lr": 4.121312358283463e-05, "epoch": 1.7959183673469388, "percentage": 60.11, "elapsed_time": "1:01:56", "remaining_time": "0:41:06"}
-{"current_steps": 230, "total_steps": 366, "loss": 0.1038, "lr": 3.656014051577713e-05, "epoch": 1.8775510204081631, "percentage": 62.84, "elapsed_time": "1:04:49", "remaining_time": "0:38:19"}
-{"current_steps": 240, "total_steps": 366, "loss": 0.1088, "lr": 3.202961135812437e-05, "epoch": 1.9591836734693877, "percentage": 65.57, "elapsed_time": "1:07:34", "remaining_time": "0:35:28"}
-{"current_steps": 250, "total_steps": 366, "loss": 0.1055, "lr": 2.7662814890184818e-05, "epoch": 2.0408163265306123, "percentage": 68.31, "elapsed_time": "1:10:20", "remaining_time": "0:32:38"}
-{"current_steps": 260, "total_steps": 366, "loss": 0.0942, "lr": 2.3499538082923606e-05, "epoch": 2.122448979591837, "percentage": 71.04, "elapsed_time": "1:13:12", "remaining_time": "0:29:50"}
-{"current_steps": 270, "total_steps": 366, "loss": 0.0954, "lr": 1.9577713588953795e-05, "epoch": 2.204081632653061, "percentage": 73.77, "elapsed_time": "1:16:00", "remaining_time": "0:27:01"}
-{"current_steps": 280, "total_steps": 366, "loss": 0.0855, "lr": 1.5933074128684332e-05, "epoch": 2.2857142857142856, "percentage": 76.5, "elapsed_time": "1:18:53", "remaining_time": "0:24:13"}
-{"current_steps": 290, "total_steps": 366, "loss": 0.1161, "lr": 1.2598826920598772e-05, "epoch": 2.36734693877551, "percentage": 79.23, "elapsed_time": "1:21:44", "remaining_time": "0:21:25"}
-{"current_steps": 300, "total_steps": 366, "loss": 0.0914, "lr": 9.605351122011309e-06, "epoch": 2.4489795918367347, "percentage": 81.97, "elapsed_time": "1:24:43", "remaining_time": "0:18:38"}
-{"current_steps": 310, "total_steps": 366, "loss": 0.0971, "lr": 6.979921036993042e-06, "epoch": 2.5306122448979593, "percentage": 84.7, "elapsed_time": "1:27:31", "remaining_time": "0:15:48"}
-{"current_steps": 320, "total_steps": 366, "loss": 0.0855, "lr": 4.746457613389904e-06, "epoch": 2.612244897959184, "percentage": 87.43, "elapsed_time": "1:30:14", "remaining_time": "0:12:58"}
-{"current_steps": 330, "total_steps": 366, "loss": 0.0928, "lr": 2.925310493105099e-06, "epoch": 2.693877551020408, "percentage": 90.16, "elapsed_time": "1:33:03", "remaining_time": "0:10:09"}
-{"current_steps": 340, "total_steps": 366, "loss": 0.1081, "lr": 1.5330726014397668e-06, "epoch": 2.7755102040816326, "percentage": 92.9, "elapsed_time": "1:35:48", "remaining_time": "0:07:19"}
-{"current_steps": 350, "total_steps": 366, "loss": 0.0895, "lr": 5.824289648152126e-07, "epoch": 2.857142857142857, "percentage": 95.63, "elapsed_time": "1:38:31", "remaining_time": "0:04:30"}
-{"current_steps": 360, "total_steps": 366, "loss": 0.0974, "lr": 8.204113433559201e-08, "epoch": 2.938775510204082, "percentage": 98.36, "elapsed_time": "1:41:18", "remaining_time": "0:01:41"}
-{"current_steps": 366, "total_steps": 366, "epoch": 2.987755102040816, "percentage": 100.0, "elapsed_time": "1:43:01", "remaining_time": "0:00:00"}
+{"current_steps": 10, "total_steps": 366, "loss": 0.7127, "lr": 2.702702702702703e-05, "epoch": 0.08154943934760449, "percentage": 2.73, "elapsed_time": "0:02:31", "remaining_time": "1:29:52"}
+{"current_steps": 20, "total_steps": 366, "loss": 0.7168, "lr": 5.405405405405406e-05, "epoch": 0.16309887869520898, "percentage": 5.46, "elapsed_time": "0:05:02", "remaining_time": "1:27:09"}
+{"current_steps": 30, "total_steps": 366, "loss": 0.5257, "lr": 8.108108108108109e-05, "epoch": 0.24464831804281345, "percentage": 8.2, "elapsed_time": "0:07:34", "remaining_time": "1:24:55"}
+{"current_steps": 40, "total_steps": 366, "loss": 0.4019, "lr": 9.997948550797227e-05, "epoch": 0.32619775739041795, "percentage": 10.93, "elapsed_time": "0:10:19", "remaining_time": "1:24:12"}
+{"current_steps": 50, "total_steps": 366, "loss": 0.3371, "lr": 9.961525153583327e-05, "epoch": 0.4077471967380224, "percentage": 13.66, "elapsed_time": "0:12:58", "remaining_time": "1:21:59"}
+{"current_steps": 60, "total_steps": 366, "loss": 0.4258, "lr": 9.879896064123961e-05, "epoch": 0.4892966360856269, "percentage": 16.39, "elapsed_time": "0:15:19", "remaining_time": "1:18:10"}
+{"current_steps": 70, "total_steps": 366, "loss": 0.375, "lr": 9.753805025397779e-05, "epoch": 0.5708460754332314, "percentage": 19.13, "elapsed_time": "0:17:49", "remaining_time": "1:15:22"}
+{"current_steps": 80, "total_steps": 366, "loss": 0.368, "lr": 9.584400884284545e-05, "epoch": 0.6523955147808359, "percentage": 21.86, "elapsed_time": "0:20:35", "remaining_time": "1:13:36"}
+{"current_steps": 90, "total_steps": 366, "loss": 0.365, "lr": 9.373227124134888e-05, "epoch": 0.7339449541284404, "percentage": 24.59, "elapsed_time": "0:23:13", "remaining_time": "1:11:13"}
+{"current_steps": 100, "total_steps": 366, "loss": 0.4522, "lr": 9.122207801708802e-05, "epoch": 0.8154943934760448, "percentage": 27.32, "elapsed_time": "0:25:39", "remaining_time": "1:08:15"}
+{"current_steps": 110, "total_steps": 366, "loss": 0.3663, "lr": 8.833630016614976e-05, "epoch": 0.8970438328236493, "percentage": 30.05, "elapsed_time": "0:28:12", "remaining_time": "1:05:38"}
+{"current_steps": 120, "total_steps": 366, "loss": 0.3597, "lr": 8.510123072976239e-05, "epoch": 0.9785932721712538, "percentage": 32.79, "elapsed_time": "0:30:47", "remaining_time": "1:03:08"}
+{"current_steps": 130, "total_steps": 366, "loss": 0.3343, "lr": 8.154634523184388e-05, "epoch": 1.0601427115188584, "percentage": 35.52, "elapsed_time": "0:33:23", "remaining_time": "1:00:37"}
+{"current_steps": 140, "total_steps": 366, "loss": 0.3215, "lr": 7.770403312015721e-05, "epoch": 1.1416921508664628, "percentage": 38.25, "elapsed_time": "0:36:03", "remaining_time": "0:58:12"}
+{"current_steps": 150, "total_steps": 366, "loss": 0.3058, "lr": 7.360930265797935e-05, "epoch": 1.2232415902140672, "percentage": 40.98, "elapsed_time": "0:38:28", "remaining_time": "0:55:24"}
+{"current_steps": 160, "total_steps": 366, "loss": 0.3273, "lr": 6.929946195508932e-05, "epoch": 1.3047910295616718, "percentage": 43.72, "elapsed_time": "0:41:01", "remaining_time": "0:52:48"}
+{"current_steps": 170, "total_steps": 366, "loss": 0.3461, "lr": 6.481377904428171e-05, "epoch": 1.3863404689092762, "percentage": 46.45, "elapsed_time": "0:43:40", "remaining_time": "0:50:20"}
+{"current_steps": 180, "total_steps": 366, "loss": 0.3111, "lr": 6.019312410053286e-05, "epoch": 1.4678899082568808, "percentage": 49.18, "elapsed_time": "0:46:22", "remaining_time": "0:47:54"}
+{"current_steps": 190, "total_steps": 366, "loss": 0.287, "lr": 5.547959706265068e-05, "epoch": 1.5494393476044852, "percentage": 51.91, "elapsed_time": "0:49:00", "remaining_time": "0:45:23"}
+{"current_steps": 200, "total_steps": 366, "loss": 0.3202, "lr": 5.0716144050239375e-05, "epoch": 1.6309887869520896, "percentage": 54.64, "elapsed_time": "0:51:33", "remaining_time": "0:42:47"}
+{"current_steps": 210, "total_steps": 366, "loss": 0.3272, "lr": 4.594616607090028e-05, "epoch": 1.7125382262996942, "percentage": 57.38, "elapsed_time": "0:54:10", "remaining_time": "0:40:14"}
+{"current_steps": 220, "total_steps": 366, "loss": 0.3277, "lr": 4.121312358283463e-05, "epoch": 1.7940876656472988, "percentage": 60.11, "elapsed_time": "0:56:38", "remaining_time": "0:37:35"}
+{"current_steps": 230, "total_steps": 366, "loss": 0.3057, "lr": 3.656014051577713e-05, "epoch": 1.8756371049949032, "percentage": 62.84, "elapsed_time": "0:59:04", "remaining_time": "0:34:55"}
+{"current_steps": 240, "total_steps": 366, "loss": 0.3117, "lr": 3.202961135812437e-05, "epoch": 1.9571865443425076, "percentage": 65.57, "elapsed_time": "1:01:42", "remaining_time": "0:32:23"}
+{"current_steps": 250, "total_steps": 366, "loss": 0.3175, "lr": 2.7662814890184818e-05, "epoch": 2.038735983690112, "percentage": 68.31, "elapsed_time": "1:04:22", "remaining_time": "0:29:52"}
+{"current_steps": 260, "total_steps": 366, "loss": 0.2854, "lr": 2.3499538082923606e-05, "epoch": 2.120285423037717, "percentage": 71.04, "elapsed_time": "1:06:56", "remaining_time": "0:27:17"}
+{"current_steps": 270, "total_steps": 366, "loss": 0.254, "lr": 1.9577713588953795e-05, "epoch": 2.2018348623853212, "percentage": 73.77, "elapsed_time": "1:09:25", "remaining_time": "0:24:41"}
+{"current_steps": 280, "total_steps": 366, "loss": 0.2637, "lr": 1.5933074128684332e-05, "epoch": 2.2833843017329256, "percentage": 76.5, "elapsed_time": "1:12:06", "remaining_time": "0:22:08"}
+{"current_steps": 290, "total_steps": 366, "loss": 0.2521, "lr": 1.2598826920598772e-05, "epoch": 2.36493374108053, "percentage": 79.23, "elapsed_time": "1:14:34", "remaining_time": "0:19:32"}
+{"current_steps": 300, "total_steps": 366, "loss": 0.2497, "lr": 9.605351122011309e-06, "epoch": 2.4464831804281344, "percentage": 81.97, "elapsed_time": "1:17:20", "remaining_time": "0:17:00"}
+{"current_steps": 310, "total_steps": 366, "loss": 0.2822, "lr": 6.979921036993042e-06, "epoch": 2.528032619775739, "percentage": 84.7, "elapsed_time": "1:19:51", "remaining_time": "0:14:25"}
+{"current_steps": 320, "total_steps": 366, "loss": 0.2675, "lr": 4.746457613389904e-06, "epoch": 2.6095820591233436, "percentage": 87.43, "elapsed_time": "1:22:35", "remaining_time": "0:11:52"}
+{"current_steps": 330, "total_steps": 366, "loss": 0.2996, "lr": 2.925310493105099e-06, "epoch": 2.691131498470948, "percentage": 90.16, "elapsed_time": "1:25:08", "remaining_time": "0:09:17"}
+{"current_steps": 340, "total_steps": 366, "loss": 0.3064, "lr": 1.5330726014397668e-06, "epoch": 2.7726809378185524, "percentage": 92.9, "elapsed_time": "1:27:40", "remaining_time": "0:06:42"}
+{"current_steps": 350, "total_steps": 366, "loss": 0.2577, "lr": 5.824289648152126e-07, "epoch": 2.8542303771661572, "percentage": 95.63, "elapsed_time": "1:30:06", "remaining_time": "0:04:07"}
+{"current_steps": 360, "total_steps": 366, "loss": 0.2614, "lr": 8.204113433559201e-08, "epoch": 2.9357798165137616, "percentage": 98.36, "elapsed_time": "1:32:41", "remaining_time": "0:01:32"}
+{"current_steps": 366, "total_steps": 366, "epoch": 2.984709480122324, "percentage": 100.0, "elapsed_time": "1:34:08", "remaining_time": "0:00:00"}
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 2.987755102040816,
+  "epoch": 2.984709480122324,
   "eval_steps": 500,
   "global_step": 366,
   "is_hyper_param_search": false,
@@ -9,265 +9,265 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.08163265306122448,
-      "grad_norm": 0.4708639085292816,
+      "epoch": 0.08154943934760449,
+      "grad_norm": 0.660578191280365,
       "learning_rate": 2.702702702702703e-05,
-      "loss": 0.5385,
+      "loss": 0.7127,
       "step": 10
     },
     {
-      "epoch": 0.16326530612244897,
-      "grad_norm": 0.4191964268684387,
+      "epoch": 0.16309887869520898,
+      "grad_norm": 0.7271870374679565,
       "learning_rate": 5.405405405405406e-05,
-      "loss": 0.4991,
+      "loss": 0.7168,
       "step": 20
     },
     {
-      "epoch": 0.24489795918367346,
-      "grad_norm": 0.40369561314582825,
+      "epoch": 0.24464831804281345,
+      "grad_norm": 0.7715198993682861,
       "learning_rate": 8.108108108108109e-05,
-      "loss": 0.3058,
+      "loss": 0.5257,
       "step": 30
     },
     {
-      "epoch": 0.32653061224489793,
-      "grad_norm": 0.6080173850059509,
+      "epoch": 0.32619775739041795,
+      "grad_norm": 0.5615158081054688,
       "learning_rate": 9.997948550797227e-05,
-      "loss": 0.223,
+      "loss": 0.4019,
       "step": 40
     },
     {
-      "epoch": 0.40816326530612246,
-      "grad_norm": 0.3516406714916229,
+      "epoch": 0.4077471967380224,
+      "grad_norm": 0.4358239769935608,
       "learning_rate": 9.961525153583327e-05,
-      "loss": 0.1966,
+      "loss": 0.3371,
       "step": 50
     },
     {
-      "epoch": 0.4897959183673469,
-      "grad_norm": 0.3611922264099121,
+      "epoch": 0.4892966360856269,
+      "grad_norm": 0.5554934740066528,
       "learning_rate": 9.879896064123961e-05,
-      "loss": 0.1635,
+      "loss": 0.4258,
       "step": 60
     },
     {
-      "epoch": 0.5714285714285714,
-      "grad_norm": 0.3673703372478485,
+      "epoch": 0.5708460754332314,
+      "grad_norm": 0.5321815609931946,
       "learning_rate": 9.753805025397779e-05,
-      "loss": 0.1547,
+      "loss": 0.375,
       "step": 70
     },
     {
-      "epoch": 0.6530612244897959,
-      "grad_norm": 0.31258246302604675,
+      "epoch": 0.6523955147808359,
+      "grad_norm": 0.42357194423675537,
       "learning_rate": 9.584400884284545e-05,
-      "loss": 0.141,
+      "loss": 0.368,
       "step": 80
     },
     {
-      "epoch": 0.7346938775510204,
-      "grad_norm": 0.40288954973220825,
+      "epoch": 0.7339449541284404,
+      "grad_norm": 0.502548098564148,
       "learning_rate": 9.373227124134888e-05,
-      "loss": 0.1343,
+      "loss": 0.365,
       "step": 90
     },
     {
-      "epoch": 0.8163265306122449,
-      "grad_norm": 0.36042073369026184,
+      "epoch": 0.8154943934760448,
+      "grad_norm": 1.1985193490982056,
       "learning_rate": 9.122207801708802e-05,
-      "loss": 0.1552,
+      "loss": 0.4522,
       "step": 100
     },
     {
-      "epoch": 0.8979591836734694,
-      "grad_norm": 0.33795708417892456,
+      "epoch": 0.8970438328236493,
+      "grad_norm": 0.5025231242179871,
       "learning_rate": 8.833630016614976e-05,
-      "loss": 0.1341,
+      "loss": 0.3663,
       "step": 110
     },
     {
-      "epoch": 0.9795918367346939,
-      "grad_norm": 0.2802504003047943,
+      "epoch": 0.9785932721712538,
+      "grad_norm": 0.3899594247341156,
       "learning_rate": 8.510123072976239e-05,
-      "loss": 0.1361,
+      "loss": 0.3597,
       "step": 120
     },
     {
-      "epoch": 1.0612244897959184,
-      "grad_norm": 0.3562999367713928,
+      "epoch": 1.0601427115188584,
+      "grad_norm": 0.4278734028339386,
       "learning_rate": 8.154634523184388e-05,
-      "loss": 0.1554,
+      "loss": 0.3343,
       "step": 130
     },
     {
-      "epoch": 1.1428571428571428,
-      "grad_norm": 0.3551533818244934,
+      "epoch": 1.1416921508664628,
+      "grad_norm": 0.5638087391853333,
       "learning_rate": 7.770403312015721e-05,
-      "loss": 0.1222,
+      "loss": 0.3215,
       "step": 140
     },
     {
-      "epoch": 1.2244897959183674,
-      "grad_norm": 0.3744591474533081,
+      "epoch": 1.2232415902140672,
+      "grad_norm": 0.5326379537582397,
       "learning_rate": 7.360930265797935e-05,
-      "loss": 0.1268,
+      "loss": 0.3058,
       "step": 150
     },
     {
-      "epoch": 1.306122448979592,
-      "grad_norm": 0.3726373016834259,
+      "epoch": 1.3047910295616718,
+      "grad_norm": 0.5562155842781067,
       "learning_rate": 6.929946195508932e-05,
-      "loss": 0.0923,
+      "loss": 0.3273,
       "step": 160
     },
     {
-      "epoch": 1.3877551020408163,
-      "grad_norm": 0.45406657457351685,
+      "epoch": 1.3863404689092762,
+      "grad_norm": 0.6649357080459595,
       "learning_rate": 6.481377904428171e-05,
-      "loss": 0.1236,
+      "loss": 0.3461,
       "step": 170
     },
     {
-      "epoch": 1.469387755102041,
-      "grad_norm": 0.26440876722335815,
+      "epoch": 1.4678899082568808,
+      "grad_norm": 0.5380542278289795,
       "learning_rate": 6.019312410053286e-05,
-      "loss": 0.1059,
+      "loss": 0.3111,
       "step": 180
     },
     {
-      "epoch": 1.5510204081632653,
-      "grad_norm": 0.3584797978401184,
+      "epoch": 1.5494393476044852,
+      "grad_norm": 0.6011327505111694,
       "learning_rate": 5.547959706265068e-05,
-      "loss": 0.13,
+      "loss": 0.287,
       "step": 190
     },
     {
-      "epoch": 1.6326530612244898,
-      "grad_norm": 0.2686370611190796,
+      "epoch": 1.6309887869520896,
+      "grad_norm": 0.5643779635429382,
       "learning_rate": 5.0716144050239375e-05,
-      "loss": 0.1132,
+      "loss": 0.3202,
       "step": 200
     },
     {
-      "epoch": 1.7142857142857144,
-      "grad_norm": 0.29573601484298706,
+      "epoch": 1.7125382262996942,
+      "grad_norm": 0.5761425495147705,
       "learning_rate": 4.594616607090028e-05,
-      "loss": 0.1093,
+      "loss": 0.3272,
       "step": 210
     },
     {
-      "epoch": 1.7959183673469388,
-      "grad_norm": 0.33462539315223694,
+      "epoch": 1.7940876656472988,
+      "grad_norm": 0.7082852125167847,
       "learning_rate": 4.121312358283463e-05,
-      "loss": 0.1014,
+      "loss": 0.3277,
       "step": 220
     },
     {
-      "epoch": 1.8775510204081631,
-      "grad_norm": 0.40894749760627747,
+      "epoch": 1.8756371049949032,
+      "grad_norm": 0.41143250465393066,
       "learning_rate": 3.656014051577713e-05,
-      "loss": 0.1038,
+      "loss": 0.3057,
       "step": 230
     },
     {
-      "epoch": 1.9591836734693877,
-      "grad_norm": 0.30836549401283264,
+      "epoch": 1.9571865443425076,
+      "grad_norm": 0.408113569021225,
       "learning_rate": 3.202961135812437e-05,
-      "loss": 0.1088,
+      "loss": 0.3117,
       "step": 240
     },
     {
-      "epoch": 2.0408163265306123,
-      "grad_norm": 0.3176974654197693,
+      "epoch": 2.038735983690112,
+      "grad_norm": 0.47071775794029236,
       "learning_rate": 2.7662814890184818e-05,
-      "loss": 0.1055,
+      "loss": 0.3175,
       "step": 250
     },
     {
-      "epoch": 2.122448979591837,
-      "grad_norm": 0.2630854845046997,
+      "epoch": 2.120285423037717,
+      "grad_norm": 0.592940092086792,
       "learning_rate": 2.3499538082923606e-05,
-      "loss": 0.0942,
+      "loss": 0.2854,
       "step": 260
     },
     {
-      "epoch": 2.204081632653061,
-      "grad_norm": 0.3280630111694336,
+      "epoch": 2.2018348623853212,
+      "grad_norm": 0.569543182849884,
       "learning_rate": 1.9577713588953795e-05,
-      "loss": 0.0954,
+      "loss": 0.254,
       "step": 270
     },
     {
-      "epoch": 2.2857142857142856,
-      "grad_norm": 0.2787545621395111,
+      "epoch": 2.2833843017329256,
+      "grad_norm": 0.6796756982803345,
       "learning_rate": 1.5933074128684332e-05,
-      "loss": 0.0855,
+      "loss": 0.2637,
       "step": 280
     },
     {
-      "epoch": 2.36734693877551,
-      "grad_norm": 0.3742433190345764,
+      "epoch": 2.36493374108053,
+      "grad_norm": 0.5621957182884216,
       "learning_rate": 1.2598826920598772e-05,
-      "loss": 0.1161,
+      "loss": 0.2521,
       "step": 290
     },
     {
-      "epoch": 2.4489795918367347,
-      "grad_norm": 0.2906797528266907,
+      "epoch": 2.4464831804281344,
+      "grad_norm": 0.708060085773468,
       "learning_rate": 9.605351122011309e-06,
-      "loss": 0.0914,
+      "loss": 0.2497,
       "step": 300
     },
     {
-      "epoch": 2.5306122448979593,
-      "grad_norm": 0.3414059579372406,
+      "epoch": 2.528032619775739,
+      "grad_norm": 0.583659827709198,
       "learning_rate": 6.979921036993042e-06,
-      "loss": 0.0971,
+      "loss": 0.2822,
       "step": 310
     },
     {
-      "epoch": 2.612244897959184,
-      "grad_norm": 0.2982410490512848,
+      "epoch": 2.6095820591233436,
+      "grad_norm": 0.6250892281532288,
       "learning_rate": 4.746457613389904e-06,
-      "loss": 0.0855,
+      "loss": 0.2675,
       "step": 320
     },
     {
-      "epoch": 2.693877551020408,
-      "grad_norm": 0.3430004417896271,
+      "epoch": 2.691131498470948,
+      "grad_norm": 0.6775749325752258,
       "learning_rate": 2.925310493105099e-06,
-      "loss": 0.0928,
+      "loss": 0.2996,
       "step": 330
     },
     {
-      "epoch": 2.7755102040816326,
-      "grad_norm": 0.4764673709869385,
+      "epoch": 2.7726809378185524,
+      "grad_norm": 0.8949369192123413,
       "learning_rate": 1.5330726014397668e-06,
-      "loss": 0.1081,
+      "loss": 0.3064,
       "step": 340
     },
     {
-      "epoch": 2.857142857142857,
-      "grad_norm": 0.3845561742782593,
+      "epoch": 2.8542303771661572,
+      "grad_norm": 0.6756715178489685,
       "learning_rate": 5.824289648152126e-07,
-      "loss": 0.0895,
+      "loss": 0.2577,
       "step": 350
     },
     {
-      "epoch": 2.938775510204082,
-      "grad_norm": 0.3539472818374634,
+      "epoch": 2.9357798165137616,
+      "grad_norm": 0.4671032130718231,
       "learning_rate": 8.204113433559201e-08,
-      "loss": 0.0974,
+      "loss": 0.2614,
       "step": 360
     },
     {
-      "epoch": 2.987755102040816,
+      "epoch": 2.984709480122324,
       "step": 366,
-      "total_flos": 9841219894640640.0,
-      "train_loss": 0.14712934611273593,
-      "train_runtime": 6269.6713,
-      "train_samples_per_second": 0.469,
-      "train_steps_per_second": 0.058
+      "total_flos": 8710805828812800.0,
+      "train_loss": 0.3467143246384918,
+      "train_runtime": 5692.8921,
+      "train_samples_per_second": 0.517,
+      "train_steps_per_second": 0.064
     }
   ],
   "logging_steps": 10,
@@ -287,7 +287,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 9841219894640640.0,
+  "total_flos": 8710805828812800.0,
   "train_batch_size": 1,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d483a705da1c1d63c473066abd5241d98a6dac495fb8004c57333467fd5d881
+oid sha256:18db36ef5929943174913d64146ca832cb44d0b91ef63a7ab3fe26d3ed18fb1c
 size 5432
training_loss.png CHANGED