Delta-Vector committed on
Commit
a511d4c
·
verified ·
1 Parent(s): 892aa41

Training in progress, step 210, checkpoint

Browse files
Files changed (20) hide show
  1. checkpoint-210/global_step210/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +1 -1
  2. checkpoint-210/global_step210/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +1 -1
  3. checkpoint-210/global_step210/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +1 -1
  4. checkpoint-210/global_step210/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +1 -1
  5. checkpoint-210/global_step210/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +1 -1
  6. checkpoint-210/global_step210/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +1 -1
  7. checkpoint-210/global_step210/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +1 -1
  8. checkpoint-210/global_step210/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +1 -1
  9. checkpoint-210/global_step210/zero_pp_rank_0_mp_rank_00_model_states.pt +1 -1
  10. checkpoint-210/global_step210/zero_pp_rank_1_mp_rank_00_model_states.pt +1 -1
  11. checkpoint-210/global_step210/zero_pp_rank_2_mp_rank_00_model_states.pt +1 -1
  12. checkpoint-210/global_step210/zero_pp_rank_3_mp_rank_00_model_states.pt +1 -1
  13. checkpoint-210/global_step210/zero_pp_rank_4_mp_rank_00_model_states.pt +1 -1
  14. checkpoint-210/global_step210/zero_pp_rank_5_mp_rank_00_model_states.pt +1 -1
  15. checkpoint-210/global_step210/zero_pp_rank_6_mp_rank_00_model_states.pt +1 -1
  16. checkpoint-210/global_step210/zero_pp_rank_7_mp_rank_00_model_states.pt +1 -1
  17. checkpoint-210/model-00001-of-00002.safetensors +1 -1
  18. checkpoint-210/model-00002-of-00002.safetensors +1 -1
  19. checkpoint-210/trainer_state.json +432 -432
  20. checkpoint-210/training_args.bin +1 -1
checkpoint-210/global_step210/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5439969fc361ff852c6adcadaa6f8422f75b1c4d9577abbdb47742237ea284d2
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a964100ea5fca29615c410816916dff4331b771b98631be51c9dfad420d8dc0e
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ca94ebbc2b41927e345a475b57b07bbebde2352f49eb2e01389abc5481c17653
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb1fbe340c6774fdd4be1723b46252a74d12ec659665f195b5e4a4cd2826776b
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d7eff2b58924e1815eb1d7264179128fcac4a7bf1d726837d9b98df23e7cf22
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a49654a6c631951b157efe0af80122e596176a6921398b0c50d0271805a21a9d
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4a1a49834444c03ae247951c1ab4b38824504a89a3570b7e37c5f7c89a2f5b6
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:520d0e55e451cb020000f91f118b036577130ead5d17172e6f147114e111bb41
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e501f13e779784e88bbeb31ebd7e9899c55fdd1372b2aa1398f76ab461785d48
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfda00dacdaa5a7b5bed6d5fa097f4c0e66887e15984e97860ce82d6c9ed4d84
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14b55d23000f932481ca1ee03f5242821e2ec13c76e75f13e763169e7f807b65
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1894b2e2b0fa6ebad413225d533b0efe9d116f5b43c1bb2aaf23578e8891dfba
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:73889a07bc11090a095476103ecf7b65f7ca60e712fdf3649960b4dc7d48ec94
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:708026c96af37c8357c64d2cd5963f91211d30aa13a6dd11b6e7670bca29475e
3
  size 3402782599
checkpoint-210/global_step210/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd35aefbd060783af9f0889e369c4cec1c55dc04e3d1bd0a1bac2749a6fd60aa
3
  size 3402782599
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc9e6b9c7381f8a59b46a41e7e495aa8c9a157942f598dba67c955720d2d5f29
3
  size 3402782599
checkpoint-210/global_step210/zero_pp_rank_0_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aa9cdf4f231f9565da95c795021c8bab2c84d38d67e870b70ca9ac3b5db3e406
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d267b5a5adc3b249e9b75831e76e86a47a12f768250b81d1e9298624098bc0ea
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_1_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:333aab5149bc5c5383397a9bda8848da74c34b5e8a6bf3ee6fc96f8c136f9d19
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7a264a507c6632110323bf2dd4ff9cffcbd94327358c5e2d5f7875ab99ef672
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_2_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:844b838dfe4fc34d298a7c7f9fe9428cc5029cdcda33e5ffb033f0a1541d8f55
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d0a2d8ea8538673663ea47f306617eaf01f6f590eaba5692481a2ac671d3e7e
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_3_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:86fc60090fd5cde3b58413c7bd09dcd59f654df46835f63114ba109d86f7ce5e
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3865c1f5c7c8a93a5d8d152259de016603bd0a15f0037b70530f92e42069893
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_4_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:47345124e9554305b5412464ecde8f73bad2da742ffe28ebdc9ed3532902dca6
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f891eda0cc8c1e1ee4ee0eb197830188cc3eb4f16eba81352fc6fe72e906ddc6
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_5_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2425a9e81eac9199883a06de96df5662dd47ef3f0030b90d61ce1a67e7fb473
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e698ee6b9a1e09854f52fa65812a2669789b266475be3cc7a9699d6800b8fb0
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_6_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55cd4356c201aed2ce55850198315a379593fcdb6f0aa90f5bb65a7754b86804
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:992728c31e93e4e06f4e0cb6901e95ce4e7835d5cf950eb080b66ef4308865e4
3
  size 150245
checkpoint-210/global_step210/zero_pp_rank_7_mp_rank_00_model_states.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8771a82bdc6003651dd20dea06121ac1ba627c645120f8d31bc084a4ebbe61e8
3
  size 150245
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92ad76ac01a9040705a38518fa43cbe8b381235bc1f21f8c8c9e1fae661b7263
3
  size 150245
checkpoint-210/model-00001-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:84c096c8d3a9b13352aff57b009e6ba81ddfbf6bd74ed35afb503bc99ecf8420
3
  size 4978354640
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5ae6ffc7d1099523179e5c3458e9b03d17cd4ce6e8653aada4b018133540637
3
  size 4978354640
checkpoint-210/model-00002-of-00002.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a4d7f35c184d870bbd8122a0ef2eed4c1516bf3d0a51f7d6297d6ca4050eda16
3
  size 4047172128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d3d84c39fb4833d412123e6aa2b89388aa9e0f00b22ba52deedf5b586e8a41a
3
  size 4047172128
checkpoint-210/trainer_state.json CHANGED
@@ -11,1496 +11,1496 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.0023781212841854932,
14
- "grad_norm": 32.382706174573094,
15
  "learning_rate": 5.000000000000001e-07,
16
- "loss": 2.9481,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.0023781212841854932,
21
- "eval_loss": 3.3739373683929443,
22
- "eval_runtime": 152.3599,
23
- "eval_samples_per_second": 4.194,
24
- "eval_steps_per_second": 0.525,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.0047562425683709865,
29
- "grad_norm": 36.523706488132554,
30
  "learning_rate": 1.0000000000000002e-06,
31
- "loss": 2.8267,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.007134363852556481,
36
- "grad_norm": 30.823955989385283,
37
  "learning_rate": 1.5e-06,
38
  "loss": 2.9936,
39
  "step": 3
40
  },
41
  {
42
  "epoch": 0.009512485136741973,
43
- "grad_norm": 27.113313789497223,
44
  "learning_rate": 2.0000000000000003e-06,
45
- "loss": 2.8945,
46
  "step": 4
47
  },
48
  {
49
  "epoch": 0.011890606420927468,
50
- "grad_norm": 13.505949871248614,
51
  "learning_rate": 2.5e-06,
52
- "loss": 2.7689,
53
  "step": 5
54
  },
55
  {
56
  "epoch": 0.014268727705112961,
57
- "grad_norm": 15.0338641814596,
58
  "learning_rate": 3e-06,
59
- "loss": 2.6611,
60
  "step": 6
61
  },
62
  {
63
  "epoch": 0.016646848989298454,
64
- "grad_norm": 14.319329036066911,
65
  "learning_rate": 3.5e-06,
66
- "loss": 2.8985,
67
  "step": 7
68
  },
69
  {
70
  "epoch": 0.019024970273483946,
71
- "grad_norm": 14.23176344511718,
72
  "learning_rate": 4.000000000000001e-06,
73
- "loss": 2.834,
74
  "step": 8
75
  },
76
  {
77
  "epoch": 0.02140309155766944,
78
- "grad_norm": 13.977227656063016,
79
  "learning_rate": 4.5e-06,
80
- "loss": 2.7566,
81
  "step": 9
82
  },
83
  {
84
  "epoch": 0.023781212841854936,
85
- "grad_norm": 11.952136262918579,
86
  "learning_rate": 5e-06,
87
- "loss": 2.5198,
88
  "step": 10
89
  },
90
  {
91
  "epoch": 0.026159334126040427,
92
- "grad_norm": 8.093250140015975,
93
  "learning_rate": 5.500000000000001e-06,
94
- "loss": 2.6486,
95
  "step": 11
96
  },
97
  {
98
  "epoch": 0.028537455410225922,
99
- "grad_norm": 4.914080620863233,
100
  "learning_rate": 6e-06,
101
- "loss": 2.4014,
102
  "step": 12
103
  },
104
  {
105
  "epoch": 0.030915576694411414,
106
- "grad_norm": 4.175963213321,
107
  "learning_rate": 6.5000000000000004e-06,
108
- "loss": 2.3925,
109
  "step": 13
110
  },
111
  {
112
  "epoch": 0.03329369797859691,
113
- "grad_norm": 4.098569808536842,
114
  "learning_rate": 7e-06,
115
- "loss": 2.5433,
116
  "step": 14
117
  },
118
  {
119
  "epoch": 0.0356718192627824,
120
- "grad_norm": 5.46644902034287,
121
  "learning_rate": 7.500000000000001e-06,
122
- "loss": 2.5083,
123
  "step": 15
124
  },
125
  {
126
  "epoch": 0.03804994054696789,
127
- "grad_norm": 5.787961659158823,
128
  "learning_rate": 8.000000000000001e-06,
129
- "loss": 2.6687,
130
  "step": 16
131
  },
132
  {
133
  "epoch": 0.04042806183115339,
134
- "grad_norm": 3.034638828158752,
135
  "learning_rate": 8.5e-06,
136
- "loss": 2.4193,
137
  "step": 17
138
  },
139
  {
140
  "epoch": 0.04280618311533888,
141
- "grad_norm": 3.6371355055713352,
142
  "learning_rate": 9e-06,
143
- "loss": 2.3729,
144
  "step": 18
145
  },
146
  {
147
  "epoch": 0.04518430439952437,
148
- "grad_norm": 2.1215238627086714,
149
  "learning_rate": 9.5e-06,
150
- "loss": 2.4799,
151
  "step": 19
152
  },
153
  {
154
  "epoch": 0.04756242568370987,
155
- "grad_norm": 3.987132196757572,
156
  "learning_rate": 1e-05,
157
- "loss": 2.3981,
158
  "step": 20
159
  },
160
  {
161
  "epoch": 0.04994054696789536,
162
- "grad_norm": 1.8643899752237216,
163
  "learning_rate": 1.0500000000000001e-05,
164
- "loss": 2.6625,
165
  "step": 21
166
  },
167
  {
168
  "epoch": 0.052318668252080855,
169
- "grad_norm": 1.9762840454371524,
170
  "learning_rate": 1.1000000000000001e-05,
171
- "loss": 2.4424,
172
  "step": 22
173
  },
174
  {
175
  "epoch": 0.054696789536266346,
176
- "grad_norm": 1.6430481169991258,
177
  "learning_rate": 1.15e-05,
178
- "loss": 2.3021,
179
  "step": 23
180
  },
181
  {
182
  "epoch": 0.057074910820451845,
183
- "grad_norm": 2.539281280765958,
184
  "learning_rate": 1.2e-05,
185
- "loss": 2.5471,
186
  "step": 24
187
  },
188
  {
189
  "epoch": 0.059453032104637336,
190
- "grad_norm": 1.457528590578011,
191
  "learning_rate": 1.25e-05,
192
- "loss": 2.2526,
193
  "step": 25
194
  },
195
  {
196
  "epoch": 0.06183115338882283,
197
- "grad_norm": 2.1155531281068884,
198
  "learning_rate": 1.3000000000000001e-05,
199
- "loss": 2.8667,
200
  "step": 26
201
  },
202
  {
203
  "epoch": 0.06420927467300833,
204
- "grad_norm": 1.73342938568033,
205
  "learning_rate": 1.3500000000000001e-05,
206
- "loss": 2.4915,
207
  "step": 27
208
  },
209
  {
210
  "epoch": 0.06658739595719382,
211
- "grad_norm": 1.7317610327219517,
212
  "learning_rate": 1.4e-05,
213
- "loss": 2.498,
214
  "step": 28
215
  },
216
  {
217
  "epoch": 0.06896551724137931,
218
- "grad_norm": 1.8363122567034704,
219
  "learning_rate": 1.45e-05,
220
- "loss": 2.4243,
221
  "step": 29
222
  },
223
  {
224
  "epoch": 0.0713436385255648,
225
- "grad_norm": 1.6309580804999364,
226
  "learning_rate": 1.5000000000000002e-05,
227
- "loss": 2.1101,
228
  "step": 30
229
  },
230
  {
231
  "epoch": 0.07372175980975029,
232
- "grad_norm": 1.334654998985815,
233
  "learning_rate": 1.55e-05,
234
- "loss": 2.1046,
235
  "step": 31
236
  },
237
  {
238
  "epoch": 0.07609988109393578,
239
- "grad_norm": 1.2376006409087055,
240
  "learning_rate": 1.6000000000000003e-05,
241
- "loss": 2.4625,
242
  "step": 32
243
  },
244
  {
245
  "epoch": 0.07847800237812129,
246
- "grad_norm": 1.8515124675338408,
247
  "learning_rate": 1.65e-05,
248
- "loss": 2.3255,
249
  "step": 33
250
  },
251
  {
252
  "epoch": 0.08085612366230678,
253
- "grad_norm": 2.4630792631286575,
254
  "learning_rate": 1.7e-05,
255
- "loss": 2.3863,
256
  "step": 34
257
  },
258
  {
259
  "epoch": 0.08323424494649227,
260
- "grad_norm": 1.4437945302508726,
261
  "learning_rate": 1.7500000000000002e-05,
262
- "loss": 2.6266,
263
  "step": 35
264
  },
265
  {
266
  "epoch": 0.08561236623067776,
267
- "grad_norm": 1.6987676732458694,
268
  "learning_rate": 1.8e-05,
269
- "loss": 2.2871,
270
  "step": 36
271
  },
272
  {
273
  "epoch": 0.08799048751486326,
274
- "grad_norm": 1.8155955386416776,
275
  "learning_rate": 1.8500000000000002e-05,
276
- "loss": 2.2534,
277
  "step": 37
278
  },
279
  {
280
  "epoch": 0.09036860879904875,
281
- "grad_norm": 1.1727394458786922,
282
  "learning_rate": 1.9e-05,
283
- "loss": 2.3458,
284
  "step": 38
285
  },
286
  {
287
  "epoch": 0.09274673008323424,
288
- "grad_norm": 1.4758138327709573,
289
  "learning_rate": 1.95e-05,
290
- "loss": 2.4411,
291
  "step": 39
292
  },
293
  {
294
  "epoch": 0.09512485136741974,
295
- "grad_norm": 2.683368502920404,
296
  "learning_rate": 2e-05,
297
- "loss": 2.5046,
298
  "step": 40
299
  },
300
  {
301
  "epoch": 0.09750297265160524,
302
- "grad_norm": 1.6044285552596205,
303
  "learning_rate": 1.9999981652287733e-05,
304
- "loss": 2.413,
305
  "step": 41
306
  },
307
  {
308
  "epoch": 0.09988109393579073,
309
- "grad_norm": 1.2802177698200177,
310
  "learning_rate": 1.999992660921826e-05,
311
- "loss": 2.2159,
312
  "step": 42
313
  },
314
  {
315
  "epoch": 0.10225921521997622,
316
- "grad_norm": 1.8164033444979393,
317
  "learning_rate": 1.999983487099356e-05,
318
- "loss": 2.5287,
319
  "step": 43
320
  },
321
  {
322
  "epoch": 0.10463733650416171,
323
- "grad_norm": 1.8420609096797744,
324
  "learning_rate": 1.999970643795027e-05,
325
- "loss": 2.3484,
326
  "step": 44
327
  },
328
  {
329
  "epoch": 0.1070154577883472,
330
- "grad_norm": 1.4183887604989038,
331
  "learning_rate": 1.9999541310559686e-05,
332
- "loss": 2.4876,
333
  "step": 45
334
  },
335
  {
336
  "epoch": 0.10939357907253269,
337
- "grad_norm": 1.419110897512839,
338
  "learning_rate": 1.9999339489427746e-05,
339
- "loss": 2.3251,
340
  "step": 46
341
  },
342
  {
343
  "epoch": 0.1117717003567182,
344
- "grad_norm": 1.6548987738436802,
345
  "learning_rate": 1.9999100975295046e-05,
346
- "loss": 2.358,
347
  "step": 47
348
  },
349
  {
350
  "epoch": 0.11414982164090369,
351
- "grad_norm": 1.7822053646902,
352
  "learning_rate": 1.999882576903682e-05,
353
- "loss": 2.5853,
354
  "step": 48
355
  },
356
  {
357
  "epoch": 0.11652794292508918,
358
- "grad_norm": 1.1885208188962406,
359
  "learning_rate": 1.9998513871662945e-05,
360
- "loss": 2.3794,
361
  "step": 49
362
  },
363
  {
364
  "epoch": 0.11890606420927467,
365
- "grad_norm": 1.7227839639402496,
366
  "learning_rate": 1.9998165284317944e-05,
367
- "loss": 2.1933,
368
  "step": 50
369
  },
370
  {
371
  "epoch": 0.12128418549346016,
372
- "grad_norm": 1.3654158039316031,
373
  "learning_rate": 1.999778000828098e-05,
374
- "loss": 2.0518,
375
  "step": 51
376
  },
377
  {
378
  "epoch": 0.12366230677764566,
379
- "grad_norm": 1.3281085763249942,
380
  "learning_rate": 1.9997358044965833e-05,
381
- "loss": 2.4352,
382
  "step": 52
383
  },
384
  {
385
  "epoch": 0.12604042806183116,
386
- "grad_norm": 1.1185472935459022,
387
  "learning_rate": 1.9996899395920915e-05,
388
- "loss": 2.4415,
389
  "step": 53
390
  },
391
  {
392
  "epoch": 0.12841854934601665,
393
- "grad_norm": 1.343325378951958,
394
  "learning_rate": 1.999640406282926e-05,
395
- "loss": 2.2661,
396
  "step": 54
397
  },
398
  {
399
  "epoch": 0.13079667063020214,
400
- "grad_norm": 1.371349840462257,
401
  "learning_rate": 1.9995872047508516e-05,
402
- "loss": 2.2813,
403
  "step": 55
404
  },
405
  {
406
  "epoch": 0.13317479191438764,
407
- "grad_norm": 1.2560078627168356,
408
  "learning_rate": 1.9995303351910934e-05,
409
- "loss": 2.494,
410
  "step": 56
411
  },
412
  {
413
  "epoch": 0.13555291319857313,
414
- "grad_norm": 1.3801141775649166,
415
  "learning_rate": 1.9994697978123363e-05,
416
- "loss": 2.1525,
417
  "step": 57
418
  },
419
  {
420
  "epoch": 0.13793103448275862,
421
- "grad_norm": 2.8895107871445167,
422
  "learning_rate": 1.9994055928367256e-05,
423
- "loss": 2.6727,
424
  "step": 58
425
  },
426
  {
427
  "epoch": 0.1403091557669441,
428
- "grad_norm": 1.231028014153545,
429
  "learning_rate": 1.999337720499863e-05,
430
- "loss": 2.3854,
431
  "step": 59
432
  },
433
  {
434
  "epoch": 0.1426872770511296,
435
- "grad_norm": 1.5588938179669447,
436
  "learning_rate": 1.99926618105081e-05,
437
- "loss": 2.033,
438
  "step": 60
439
  },
440
  {
441
  "epoch": 0.1450653983353151,
442
- "grad_norm": 1.1690094119529322,
443
  "learning_rate": 1.9991909747520835e-05,
444
- "loss": 2.1,
445
  "step": 61
446
  },
447
  {
448
  "epoch": 0.14744351961950058,
449
- "grad_norm": 1.5407871708756533,
450
  "learning_rate": 1.999112101879656e-05,
451
- "loss": 2.3949,
452
  "step": 62
453
  },
454
  {
455
  "epoch": 0.14982164090368608,
456
- "grad_norm": 1.0249789290696483,
457
  "learning_rate": 1.9990295627229544e-05,
458
- "loss": 2.463,
459
  "step": 63
460
  },
461
  {
462
  "epoch": 0.15219976218787157,
463
- "grad_norm": 1.5221618005694364,
464
  "learning_rate": 1.99894335758486e-05,
465
- "loss": 2.0575,
466
  "step": 64
467
  },
468
  {
469
  "epoch": 0.1545778834720571,
470
- "grad_norm": 1.0758079202744963,
471
  "learning_rate": 1.9988534867817065e-05,
472
- "loss": 2.0227,
473
  "step": 65
474
  },
475
  {
476
  "epoch": 0.15695600475624258,
477
- "grad_norm": 1.3430324490071632,
478
  "learning_rate": 1.9987599506432785e-05,
479
- "loss": 2.467,
480
  "step": 66
481
  },
482
  {
483
  "epoch": 0.15933412604042807,
484
- "grad_norm": 1.0714364507553935,
485
  "learning_rate": 1.9986627495128105e-05,
486
- "loss": 2.3812,
487
  "step": 67
488
  },
489
  {
490
  "epoch": 0.16171224732461356,
491
- "grad_norm": 1.296279438065536,
492
  "learning_rate": 1.9985618837469864e-05,
493
- "loss": 2.4045,
494
  "step": 68
495
  },
496
  {
497
  "epoch": 0.16409036860879905,
498
- "grad_norm": 1.1423020273292732,
499
  "learning_rate": 1.998457353715938e-05,
500
- "loss": 2.386,
501
  "step": 69
502
  },
503
  {
504
  "epoch": 0.16646848989298454,
505
- "grad_norm": 1.5165435155836928,
506
  "learning_rate": 1.998349159803241e-05,
507
- "loss": 2.3063,
508
  "step": 70
509
  },
510
  {
511
  "epoch": 0.16884661117717004,
512
- "grad_norm": 2.758568140385349,
513
  "learning_rate": 1.9982373024059195e-05,
514
- "loss": 2.4007,
515
  "step": 71
516
  },
517
  {
518
  "epoch": 0.17122473246135553,
519
- "grad_norm": 1.268753870102164,
520
  "learning_rate": 1.998121781934438e-05,
521
- "loss": 2.1938,
522
  "step": 72
523
  },
524
  {
525
  "epoch": 0.17360285374554102,
526
- "grad_norm": 1.1116353498200209,
527
  "learning_rate": 1.9980025988127037e-05,
528
- "loss": 2.1202,
529
  "step": 73
530
  },
531
  {
532
  "epoch": 0.1759809750297265,
533
- "grad_norm": 1.266229091733924,
534
  "learning_rate": 1.9978797534780646e-05,
535
- "loss": 2.4397,
536
  "step": 74
537
  },
538
  {
539
  "epoch": 0.178359096313912,
540
- "grad_norm": 1.2442419764935988,
541
  "learning_rate": 1.9977532463813064e-05,
542
- "loss": 2.4345,
543
  "step": 75
544
  },
545
  {
546
  "epoch": 0.1807372175980975,
547
- "grad_norm": 1.0991664177678122,
548
  "learning_rate": 1.9976230779866527e-05,
549
- "loss": 2.3604,
550
  "step": 76
551
  },
552
  {
553
  "epoch": 0.18311533888228299,
554
- "grad_norm": 1.4648399555465317,
555
  "learning_rate": 1.9974892487717613e-05,
556
- "loss": 2.4796,
557
  "step": 77
558
  },
559
  {
560
  "epoch": 0.18549346016646848,
561
- "grad_norm": 1.4075214014031838,
562
  "learning_rate": 1.997351759227725e-05,
563
- "loss": 2.4604,
564
  "step": 78
565
  },
566
  {
567
  "epoch": 0.187871581450654,
568
- "grad_norm": 1.1658962187610111,
569
  "learning_rate": 1.9972106098590665e-05,
570
- "loss": 2.3639,
571
  "step": 79
572
  },
573
  {
574
  "epoch": 0.1902497027348395,
575
- "grad_norm": 1.0920203834125308,
576
  "learning_rate": 1.9970658011837404e-05,
577
- "loss": 2.1962,
578
  "step": 80
579
  },
580
  {
581
  "epoch": 0.19262782401902498,
582
- "grad_norm": 1.0547465850317588,
583
  "learning_rate": 1.9969173337331283e-05,
584
- "loss": 2.2381,
585
  "step": 81
586
  },
587
  {
588
  "epoch": 0.19500594530321047,
589
- "grad_norm": 0.9700896578179515,
590
  "learning_rate": 1.996765208052037e-05,
591
- "loss": 1.9818,
592
  "step": 82
593
  },
594
  {
595
  "epoch": 0.19738406658739596,
596
- "grad_norm": 1.3721631541122385,
597
  "learning_rate": 1.9966094246986983e-05,
598
- "loss": 2.1842,
599
  "step": 83
600
  },
601
  {
602
  "epoch": 0.19976218787158145,
603
- "grad_norm": 1.1909292893353944,
604
  "learning_rate": 1.9964499842447665e-05,
605
- "loss": 2.5704,
606
  "step": 84
607
  },
608
  {
609
  "epoch": 0.20214030915576695,
610
- "grad_norm": 1.0845849173658786,
611
  "learning_rate": 1.9962868872753144e-05,
612
- "loss": 2.1158,
613
  "step": 85
614
  },
615
  {
616
  "epoch": 0.20451843043995244,
617
- "grad_norm": 1.1512213319968665,
618
  "learning_rate": 1.996120134388834e-05,
619
- "loss": 2.3564,
620
  "step": 86
621
  },
622
  {
623
  "epoch": 0.20689655172413793,
624
- "grad_norm": 1.1653706380489515,
625
  "learning_rate": 1.995949726197231e-05,
626
- "loss": 2.4664,
627
  "step": 87
628
  },
629
  {
630
  "epoch": 0.20927467300832342,
631
- "grad_norm": 1.2375083589347724,
632
  "learning_rate": 1.9957756633258264e-05,
633
- "loss": 2.2763,
634
  "step": 88
635
  },
636
  {
637
  "epoch": 0.2116527942925089,
638
- "grad_norm": 1.180995352120129,
639
  "learning_rate": 1.9955979464133515e-05,
640
- "loss": 2.28,
641
  "step": 89
642
  },
643
  {
644
  "epoch": 0.2140309155766944,
645
- "grad_norm": 1.0060015116786936,
646
  "learning_rate": 1.995416576111945e-05,
647
- "loss": 2.0939,
648
  "step": 90
649
  },
650
  {
651
  "epoch": 0.2164090368608799,
652
- "grad_norm": 1.123230557994905,
653
  "learning_rate": 1.9952315530871537e-05,
654
- "loss": 2.2575,
655
  "step": 91
656
  },
657
  {
658
  "epoch": 0.21878715814506539,
659
- "grad_norm": 1.033250223812201,
660
  "learning_rate": 1.9950428780179274e-05,
661
- "loss": 2.2192,
662
  "step": 92
663
  },
664
  {
665
  "epoch": 0.2211652794292509,
666
- "grad_norm": 1.01496288741104,
667
  "learning_rate": 1.994850551596617e-05,
668
- "loss": 2.3692,
669
  "step": 93
670
  },
671
  {
672
  "epoch": 0.2235434007134364,
673
- "grad_norm": 1.2509512883238079,
674
  "learning_rate": 1.9946545745289727e-05,
675
- "loss": 2.5349,
676
  "step": 94
677
  },
678
  {
679
  "epoch": 0.2259215219976219,
680
- "grad_norm": 1.1631889518067213,
681
  "learning_rate": 1.9944549475341404e-05,
682
- "loss": 2.2335,
683
  "step": 95
684
  },
685
  {
686
  "epoch": 0.22829964328180738,
687
- "grad_norm": 1.0176487393302203,
688
  "learning_rate": 1.99425167134466e-05,
689
- "loss": 2.325,
690
  "step": 96
691
  },
692
  {
693
  "epoch": 0.23067776456599287,
694
- "grad_norm": 1.0766159235170416,
695
  "learning_rate": 1.9940447467064624e-05,
696
- "loss": 2.4656,
697
  "step": 97
698
  },
699
  {
700
  "epoch": 0.23305588585017836,
701
- "grad_norm": 1.1250639831138038,
702
  "learning_rate": 1.9938341743788658e-05,
703
- "loss": 2.1741,
704
  "step": 98
705
  },
706
  {
707
  "epoch": 0.23543400713436385,
708
- "grad_norm": 1.1024674310720775,
709
  "learning_rate": 1.9936199551345744e-05,
710
- "loss": 2.1336,
711
  "step": 99
712
  },
713
  {
714
  "epoch": 0.23781212841854935,
715
- "grad_norm": 1.2527734635640946,
716
  "learning_rate": 1.9934020897596752e-05,
717
- "loss": 2.2741,
718
  "step": 100
719
  },
720
  {
721
  "epoch": 0.24019024970273484,
722
- "grad_norm": 0.9699919156060421,
723
  "learning_rate": 1.9931805790536342e-05,
724
- "loss": 2.2369,
725
  "step": 101
726
  },
727
  {
728
  "epoch": 0.24256837098692033,
729
- "grad_norm": 1.1834325873202396,
730
  "learning_rate": 1.9929554238292944e-05,
731
- "loss": 2.0419,
732
  "step": 102
733
  },
734
  {
735
  "epoch": 0.24494649227110582,
736
- "grad_norm": 1.1078024399344104,
737
  "learning_rate": 1.992726624912872e-05,
738
- "loss": 2.4991,
739
  "step": 103
740
  },
741
  {
742
  "epoch": 0.2473246135552913,
743
- "grad_norm": 1.1465140647519878,
744
  "learning_rate": 1.992494183143955e-05,
745
- "loss": 2.6218,
746
  "step": 104
747
  },
748
  {
749
  "epoch": 0.2497027348394768,
750
- "grad_norm": 1.143333452765713,
751
  "learning_rate": 1.9922580993754985e-05,
752
- "loss": 2.3428,
753
  "step": 105
754
  },
755
  {
756
  "epoch": 0.2497027348394768,
757
- "eval_loss": 2.499577760696411,
758
- "eval_runtime": 151.1827,
759
- "eval_samples_per_second": 4.227,
760
- "eval_steps_per_second": 0.529,
761
  "step": 105
762
  },
763
  {
764
  "epoch": 0.2520808561236623,
765
- "grad_norm": 1.0672145334944887,
766
  "learning_rate": 1.9920183744738208e-05,
767
- "loss": 2.3485,
768
  "step": 106
769
  },
770
  {
771
  "epoch": 0.2544589774078478,
772
- "grad_norm": 1.0044582600402383,
773
  "learning_rate": 1.9917750093186036e-05,
774
- "loss": 2.1678,
775
  "step": 107
776
  },
777
  {
778
  "epoch": 0.2568370986920333,
779
- "grad_norm": 1.1029447003396373,
780
  "learning_rate": 1.9915280048028853e-05,
781
- "loss": 2.3967,
782
  "step": 108
783
  },
784
  {
785
  "epoch": 0.25921521997621877,
786
- "grad_norm": 1.260079077740416,
787
  "learning_rate": 1.9912773618330595e-05,
788
- "loss": 2.4385,
789
  "step": 109
790
  },
791
  {
792
  "epoch": 0.2615933412604043,
793
- "grad_norm": 1.0846663572847435,
794
  "learning_rate": 1.9910230813288713e-05,
795
- "loss": 2.1431,
796
  "step": 110
797
  },
798
  {
799
  "epoch": 0.26397146254458975,
800
- "grad_norm": 0.991674367348856,
801
  "learning_rate": 1.9907651642234138e-05,
802
- "loss": 1.9523,
803
  "step": 111
804
  },
805
  {
806
  "epoch": 0.26634958382877527,
807
- "grad_norm": 1.105394178981242,
808
  "learning_rate": 1.9905036114631247e-05,
809
- "loss": 2.3063,
810
  "step": 112
811
  },
812
  {
813
  "epoch": 0.26872770511296074,
814
- "grad_norm": 0.9979450972746343,
815
  "learning_rate": 1.990238424007783e-05,
816
- "loss": 2.2218,
817
  "step": 113
818
  },
819
  {
820
  "epoch": 0.27110582639714625,
821
- "grad_norm": 1.2796796522419402,
822
  "learning_rate": 1.989969602830505e-05,
823
- "loss": 2.2271,
824
  "step": 114
825
  },
826
  {
827
  "epoch": 0.2734839476813318,
828
- "grad_norm": 1.1720433923322253,
829
  "learning_rate": 1.9896971489177417e-05,
830
- "loss": 2.2736,
831
  "step": 115
832
  },
833
  {
834
  "epoch": 0.27586206896551724,
835
- "grad_norm": 1.0744949490065667,
836
  "learning_rate": 1.9894210632692745e-05,
837
- "loss": 2.1864,
838
  "step": 116
839
  },
840
  {
841
  "epoch": 0.27824019024970276,
842
- "grad_norm": 1.9078078540875143,
843
  "learning_rate": 1.9891413468982112e-05,
844
- "loss": 2.2678,
845
  "step": 117
846
  },
847
  {
848
  "epoch": 0.2806183115338882,
849
- "grad_norm": 1.0427118571864202,
850
  "learning_rate": 1.988858000830983e-05,
851
- "loss": 2.1757,
852
  "step": 118
853
  },
854
  {
855
  "epoch": 0.28299643281807374,
856
- "grad_norm": 1.8597307152991636,
857
  "learning_rate": 1.9885710261073402e-05,
858
- "loss": 2.2913,
859
  "step": 119
860
  },
861
  {
862
  "epoch": 0.2853745541022592,
863
- "grad_norm": 1.0401590654407316,
864
  "learning_rate": 1.9882804237803487e-05,
865
- "loss": 2.0629,
866
  "step": 120
867
  },
868
  {
869
  "epoch": 0.2877526753864447,
870
- "grad_norm": 1.5079024059776849,
871
  "learning_rate": 1.9879861949163863e-05,
872
- "loss": 2.1827,
873
  "step": 121
874
  },
875
  {
876
  "epoch": 0.2901307966706302,
877
- "grad_norm": 1.1499157717530648,
878
  "learning_rate": 1.9876883405951378e-05,
879
- "loss": 2.2959,
880
  "step": 122
881
  },
882
  {
883
  "epoch": 0.2925089179548157,
884
- "grad_norm": 1.030399166377001,
885
  "learning_rate": 1.987386861909593e-05,
886
- "loss": 2.279,
887
  "step": 123
888
  },
889
  {
890
  "epoch": 0.29488703923900117,
891
- "grad_norm": 1.062349469745721,
892
  "learning_rate": 1.98708175996604e-05,
893
- "loss": 2.2865,
894
  "step": 124
895
  },
896
  {
897
  "epoch": 0.2972651605231867,
898
- "grad_norm": 1.0853089306877393,
899
  "learning_rate": 1.986773035884064e-05,
900
- "loss": 2.332,
901
  "step": 125
902
  },
903
  {
904
  "epoch": 0.29964328180737215,
905
- "grad_norm": 1.0066223498076698,
906
  "learning_rate": 1.9864606907965407e-05,
907
- "loss": 2.3971,
908
  "step": 126
909
  },
910
  {
911
  "epoch": 0.30202140309155767,
912
- "grad_norm": 1.101756427229776,
913
  "learning_rate": 1.986144725849634e-05,
914
- "loss": 2.2862,
915
  "step": 127
916
  },
917
  {
918
  "epoch": 0.30439952437574314,
919
- "grad_norm": 1.1264625682995106,
920
  "learning_rate": 1.9858251422027903e-05,
921
- "loss": 2.0974,
922
  "step": 128
923
  },
924
  {
925
  "epoch": 0.30677764565992865,
926
- "grad_norm": 1.0012622487091893,
927
  "learning_rate": 1.9855019410287355e-05,
928
- "loss": 2.2648,
929
  "step": 129
930
  },
931
  {
932
  "epoch": 0.3091557669441142,
933
- "grad_norm": 1.0030908712085922,
934
  "learning_rate": 1.98517512351347e-05,
935
- "loss": 2.2626,
936
  "step": 130
937
  },
938
  {
939
  "epoch": 0.31153388822829964,
940
- "grad_norm": 1.1376149636856583,
941
  "learning_rate": 1.9848446908562647e-05,
942
- "loss": 2.2315,
943
  "step": 131
944
  },
945
  {
946
  "epoch": 0.31391200951248516,
947
- "grad_norm": 0.9616294646184662,
948
  "learning_rate": 1.9845106442696563e-05,
949
- "loss": 2.4033,
950
  "step": 132
951
  },
952
  {
953
  "epoch": 0.3162901307966706,
954
- "grad_norm": 1.3402719458865533,
955
  "learning_rate": 1.9841729849794427e-05,
956
- "loss": 2.4429,
957
  "step": 133
958
  },
959
  {
960
  "epoch": 0.31866825208085614,
961
- "grad_norm": 1.3575140959134089,
962
  "learning_rate": 1.983831714224679e-05,
963
- "loss": 2.291,
964
  "step": 134
965
  },
966
  {
967
  "epoch": 0.3210463733650416,
968
- "grad_norm": 0.9521047622766085,
969
  "learning_rate": 1.9834868332576727e-05,
970
- "loss": 2.2754,
971
  "step": 135
972
  },
973
  {
974
  "epoch": 0.3234244946492271,
975
- "grad_norm": 1.0875381930222732,
976
  "learning_rate": 1.9831383433439798e-05,
977
- "loss": 2.1469,
978
  "step": 136
979
  },
980
  {
981
  "epoch": 0.3258026159334126,
982
- "grad_norm": 1.0257090212605473,
983
  "learning_rate": 1.982786245762398e-05,
984
- "loss": 2.0848,
985
  "step": 137
986
  },
987
  {
988
  "epoch": 0.3281807372175981,
989
- "grad_norm": 1.0372156134974286,
990
  "learning_rate": 1.9824305418049645e-05,
991
- "loss": 2.4043,
992
  "step": 138
993
  },
994
  {
995
  "epoch": 0.33055885850178357,
996
- "grad_norm": 0.9832029689246473,
997
  "learning_rate": 1.9820712327769503e-05,
998
- "loss": 2.177,
999
  "step": 139
1000
  },
1001
  {
1002
  "epoch": 0.3329369797859691,
1003
- "grad_norm": 1.0472107045649877,
1004
  "learning_rate": 1.9817083199968552e-05,
1005
- "loss": 2.3309,
1006
  "step": 140
1007
  },
1008
  {
1009
  "epoch": 0.33531510107015455,
1010
- "grad_norm": 1.0277879008926316,
1011
  "learning_rate": 1.9813418047964025e-05,
1012
- "loss": 2.1389,
1013
  "step": 141
1014
  },
1015
  {
1016
  "epoch": 0.3376932223543401,
1017
- "grad_norm": 1.15382327979194,
1018
  "learning_rate": 1.9809716885205363e-05,
1019
- "loss": 2.3254,
1020
  "step": 142
1021
  },
1022
  {
1023
  "epoch": 0.3400713436385256,
1024
- "grad_norm": 0.9933827586398313,
1025
  "learning_rate": 1.980597972527413e-05,
1026
- "loss": 2.2454,
1027
  "step": 143
1028
  },
1029
  {
1030
  "epoch": 0.34244946492271106,
1031
- "grad_norm": 0.9637942573486198,
1032
  "learning_rate": 1.9802206581883992e-05,
1033
- "loss": 2.1945,
1034
  "step": 144
1035
  },
1036
  {
1037
  "epoch": 0.3448275862068966,
1038
- "grad_norm": 0.918837452187297,
1039
  "learning_rate": 1.979839746888067e-05,
1040
- "loss": 2.1599,
1041
  "step": 145
1042
  },
1043
  {
1044
  "epoch": 0.34720570749108204,
1045
- "grad_norm": 0.962272148586432,
1046
  "learning_rate": 1.979455240024186e-05,
1047
- "loss": 2.1504,
1048
  "step": 146
1049
  },
1050
  {
1051
  "epoch": 0.34958382877526756,
1052
- "grad_norm": 1.0902104459187203,
1053
  "learning_rate": 1.97906713900772e-05,
1054
- "loss": 2.1671,
1055
  "step": 147
1056
  },
1057
  {
1058
  "epoch": 0.351961950059453,
1059
- "grad_norm": 0.9528336584077055,
1060
  "learning_rate": 1.9786754452628226e-05,
1061
- "loss": 2.113,
1062
  "step": 148
1063
  },
1064
  {
1065
  "epoch": 0.35434007134363854,
1066
- "grad_norm": 0.9216300894356183,
1067
  "learning_rate": 1.9782801602268306e-05,
1068
- "loss": 1.9271,
1069
  "step": 149
1070
  },
1071
  {
1072
  "epoch": 0.356718192627824,
1073
- "grad_norm": 0.9148117236258577,
1074
  "learning_rate": 1.9778812853502592e-05,
1075
- "loss": 2.0221,
1076
  "step": 150
1077
  },
1078
  {
1079
  "epoch": 0.3590963139120095,
1080
- "grad_norm": 0.9737793407143137,
1081
  "learning_rate": 1.9774788220967968e-05,
1082
- "loss": 2.2007,
1083
  "step": 151
1084
  },
1085
  {
1086
  "epoch": 0.361474435196195,
1087
- "grad_norm": 0.8979472123958888,
1088
  "learning_rate": 1.9770727719432994e-05,
1089
- "loss": 2.1295,
1090
  "step": 152
1091
  },
1092
  {
1093
  "epoch": 0.3638525564803805,
1094
- "grad_norm": 1.0391719811646754,
1095
  "learning_rate": 1.9766631363797852e-05,
1096
- "loss": 2.2383,
1097
  "step": 153
1098
  },
1099
  {
1100
  "epoch": 0.36623067776456597,
1101
- "grad_norm": 1.0898893562871452,
1102
  "learning_rate": 1.9762499169094288e-05,
1103
- "loss": 2.085,
1104
  "step": 154
1105
  },
1106
  {
1107
  "epoch": 0.3686087990487515,
1108
- "grad_norm": 2.9046586074281686,
1109
  "learning_rate": 1.9758331150485576e-05,
1110
- "loss": 2.2903,
1111
  "step": 155
1112
  },
1113
  {
1114
  "epoch": 0.37098692033293695,
1115
- "grad_norm": 1.0438135264691892,
1116
  "learning_rate": 1.9754127323266426e-05,
1117
- "loss": 2.349,
1118
  "step": 156
1119
  },
1120
  {
1121
  "epoch": 0.3733650416171225,
1122
- "grad_norm": 1.2082790526216014,
1123
  "learning_rate": 1.9749887702862972e-05,
1124
- "loss": 2.2182,
1125
  "step": 157
1126
  },
1127
  {
1128
  "epoch": 0.375743162901308,
1129
- "grad_norm": 1.2415740026323197,
1130
  "learning_rate": 1.9745612304832672e-05,
1131
- "loss": 2.4834,
1132
  "step": 158
1133
  },
1134
  {
1135
  "epoch": 0.37812128418549346,
1136
- "grad_norm": 0.9901876124346225,
1137
  "learning_rate": 1.9741301144864284e-05,
1138
- "loss": 2.2873,
1139
  "step": 159
1140
  },
1141
  {
1142
  "epoch": 0.380499405469679,
1143
- "grad_norm": 1.1185971951047096,
1144
  "learning_rate": 1.9736954238777793e-05,
1145
- "loss": 2.2114,
1146
  "step": 160
1147
  },
1148
  {
1149
  "epoch": 0.38287752675386444,
1150
- "grad_norm": 1.0186645068648283,
1151
  "learning_rate": 1.9732571602524353e-05,
1152
- "loss": 2.3323,
1153
  "step": 161
1154
  },
1155
  {
1156
  "epoch": 0.38525564803804996,
1157
- "grad_norm": 0.9856339888297305,
1158
  "learning_rate": 1.972815325218624e-05,
1159
- "loss": 2.2638,
1160
  "step": 162
1161
  },
1162
  {
1163
  "epoch": 0.3876337693222354,
1164
- "grad_norm": 1.287711819624049,
1165
  "learning_rate": 1.9723699203976768e-05,
1166
- "loss": 2.3897,
1167
  "step": 163
1168
  },
1169
  {
1170
  "epoch": 0.39001189060642094,
1171
- "grad_norm": 0.9474533284935532,
1172
  "learning_rate": 1.9719209474240263e-05,
1173
- "loss": 1.8287,
1174
  "step": 164
1175
  },
1176
  {
1177
  "epoch": 0.3923900118906064,
1178
- "grad_norm": 1.0505224096035144,
1179
  "learning_rate": 1.971468407945198e-05,
1180
- "loss": 2.3906,
1181
  "step": 165
1182
  },
1183
  {
1184
  "epoch": 0.3947681331747919,
1185
- "grad_norm": 0.9322039774829307,
1186
  "learning_rate": 1.9710123036218044e-05,
1187
- "loss": 2.0246,
1188
  "step": 166
1189
  },
1190
  {
1191
  "epoch": 0.3971462544589774,
1192
- "grad_norm": 1.1428006052468438,
1193
  "learning_rate": 1.97055263612754e-05,
1194
- "loss": 2.0085,
1195
  "step": 167
1196
  },
1197
  {
1198
  "epoch": 0.3995243757431629,
1199
- "grad_norm": 0.9233456322532203,
1200
  "learning_rate": 1.9700894071491736e-05,
1201
- "loss": 2.0657,
1202
  "step": 168
1203
  },
1204
  {
1205
  "epoch": 0.40190249702734837,
1206
- "grad_norm": 1.1387607148614496,
1207
  "learning_rate": 1.9696226183865436e-05,
1208
- "loss": 2.2507,
1209
  "step": 169
1210
  },
1211
  {
1212
  "epoch": 0.4042806183115339,
1213
- "grad_norm": 1.0240739510681864,
1214
  "learning_rate": 1.969152271552552e-05,
1215
- "loss": 2.1685,
1216
  "step": 170
1217
  },
1218
  {
1219
  "epoch": 0.40665873959571935,
1220
- "grad_norm": 1.2665975670284688,
1221
  "learning_rate": 1.9686783683731557e-05,
1222
- "loss": 2.3869,
1223
  "step": 171
1224
  },
1225
  {
1226
  "epoch": 0.4090368608799049,
1227
- "grad_norm": 1.0148421037850517,
1228
  "learning_rate": 1.9682009105873633e-05,
1229
- "loss": 2.1379,
1230
  "step": 172
1231
  },
1232
  {
1233
  "epoch": 0.4114149821640904,
1234
- "grad_norm": 1.0117482642225601,
1235
  "learning_rate": 1.9677198999472257e-05,
1236
- "loss": 2.1104,
1237
  "step": 173
1238
  },
1239
  {
1240
  "epoch": 0.41379310344827586,
1241
- "grad_norm": 0.9720066057353862,
1242
  "learning_rate": 1.967235338217832e-05,
1243
- "loss": 2.2884,
1244
  "step": 174
1245
  },
1246
  {
1247
  "epoch": 0.4161712247324614,
1248
- "grad_norm": 1.2836956527083296,
1249
  "learning_rate": 1.9667472271773026e-05,
1250
- "loss": 2.281,
1251
  "step": 175
1252
  },
1253
  {
1254
  "epoch": 0.41854934601664684,
1255
- "grad_norm": 0.9738075739171279,
1256
  "learning_rate": 1.9662555686167808e-05,
1257
- "loss": 2.2039,
1258
  "step": 176
1259
  },
1260
  {
1261
  "epoch": 0.42092746730083236,
1262
- "grad_norm": 1.0382703606377657,
1263
  "learning_rate": 1.965760364340429e-05,
1264
- "loss": 2.1142,
1265
  "step": 177
1266
  },
1267
  {
1268
  "epoch": 0.4233055885850178,
1269
- "grad_norm": 0.9318167386351257,
1270
  "learning_rate": 1.9652616161654204e-05,
1271
- "loss": 2.2409,
1272
  "step": 178
1273
  },
1274
  {
1275
  "epoch": 0.42568370986920334,
1276
- "grad_norm": 1.3023888510009893,
1277
  "learning_rate": 1.9647593259219328e-05,
1278
- "loss": 2.1972,
1279
  "step": 179
1280
  },
1281
  {
1282
  "epoch": 0.4280618311533888,
1283
- "grad_norm": 2.3590946551757583,
1284
  "learning_rate": 1.964253495453141e-05,
1285
- "loss": 2.1922,
1286
  "step": 180
1287
  },
1288
  {
1289
  "epoch": 0.4304399524375743,
1290
- "grad_norm": 1.277353180787431,
1291
  "learning_rate": 1.963744126615212e-05,
1292
- "loss": 2.384,
1293
  "step": 181
1294
  },
1295
  {
1296
  "epoch": 0.4328180737217598,
1297
- "grad_norm": 0.9912373962254282,
1298
  "learning_rate": 1.9632312212772956e-05,
1299
- "loss": 2.3005,
1300
  "step": 182
1301
  },
1302
  {
1303
  "epoch": 0.4351961950059453,
1304
- "grad_norm": 1.3088391002508755,
1305
  "learning_rate": 1.9627147813215207e-05,
1306
- "loss": 2.2924,
1307
  "step": 183
1308
  },
1309
  {
1310
  "epoch": 0.43757431629013077,
1311
- "grad_norm": 1.0523417119399674,
1312
  "learning_rate": 1.9621948086429847e-05,
1313
- "loss": 2.2255,
1314
  "step": 184
1315
  },
1316
  {
1317
  "epoch": 0.4399524375743163,
1318
- "grad_norm": 1.386504728655603,
1319
  "learning_rate": 1.9616713051497496e-05,
1320
- "loss": 2.1963,
1321
  "step": 185
1322
  },
1323
  {
1324
  "epoch": 0.4423305588585018,
1325
- "grad_norm": 0.9623064139518317,
1326
  "learning_rate": 1.9611442727628344e-05,
1327
- "loss": 2.3175,
1328
  "step": 186
1329
  },
1330
  {
1331
  "epoch": 0.4447086801426873,
1332
- "grad_norm": 1.7130236244329065,
1333
  "learning_rate": 1.960613713416206e-05,
1334
- "loss": 2.2246,
1335
  "step": 187
1336
  },
1337
  {
1338
  "epoch": 0.4470868014268728,
1339
- "grad_norm": 1.2183030146426241,
1340
  "learning_rate": 1.9600796290567747e-05,
1341
- "loss": 2.2345,
1342
  "step": 188
1343
  },
1344
  {
1345
  "epoch": 0.44946492271105826,
1346
- "grad_norm": 2.1564007093614497,
1347
  "learning_rate": 1.9595420216443864e-05,
1348
- "loss": 2.269,
1349
  "step": 189
1350
  },
1351
  {
1352
  "epoch": 0.4518430439952438,
1353
- "grad_norm": 1.1685306329495788,
1354
  "learning_rate": 1.9590008931518133e-05,
1355
- "loss": 2.4817,
1356
  "step": 190
1357
  },
1358
  {
1359
  "epoch": 0.45422116527942924,
1360
- "grad_norm": 1.9314760188762214,
1361
  "learning_rate": 1.9584562455647494e-05,
1362
- "loss": 2.2482,
1363
  "step": 191
1364
  },
1365
  {
1366
  "epoch": 0.45659928656361476,
1367
- "grad_norm": 1.2361103482091276,
1368
  "learning_rate": 1.9579080808818035e-05,
1369
- "loss": 2.2229,
1370
  "step": 192
1371
  },
1372
  {
1373
  "epoch": 0.4589774078478002,
1374
- "grad_norm": 1.7230964883799738,
1375
  "learning_rate": 1.9573564011144873e-05,
1376
- "loss": 2.142,
1377
  "step": 193
1378
  },
1379
  {
1380
  "epoch": 0.46135552913198574,
1381
- "grad_norm": 1.2344882256405865,
1382
  "learning_rate": 1.9568012082872148e-05,
1383
- "loss": 2.0984,
1384
  "step": 194
1385
  },
1386
  {
1387
  "epoch": 0.4637336504161712,
1388
- "grad_norm": 1.563594408096457,
1389
  "learning_rate": 1.9562425044372884e-05,
1390
- "loss": 1.9245,
1391
  "step": 195
1392
  },
1393
  {
1394
  "epoch": 0.4661117717003567,
1395
- "grad_norm": 1.942680386259649,
1396
  "learning_rate": 1.9556802916148963e-05,
1397
- "loss": 2.2633,
1398
  "step": 196
1399
  },
1400
  {
1401
  "epoch": 0.4684898929845422,
1402
- "grad_norm": 1.3339191105911707,
1403
  "learning_rate": 1.955114571883102e-05,
1404
- "loss": 2.1356,
1405
  "step": 197
1406
  },
1407
  {
1408
  "epoch": 0.4708680142687277,
1409
- "grad_norm": 2.204421390949867,
1410
  "learning_rate": 1.9545453473178384e-05,
1411
- "loss": 2.2506,
1412
  "step": 198
1413
  },
1414
  {
1415
  "epoch": 0.47324613555291317,
1416
- "grad_norm": 1.106826809772669,
1417
  "learning_rate": 1.9539726200078987e-05,
1418
- "loss": 2.0526,
1419
  "step": 199
1420
  },
1421
  {
1422
  "epoch": 0.4756242568370987,
1423
- "grad_norm": 2.7022504927488136,
1424
  "learning_rate": 1.9533963920549307e-05,
1425
- "loss": 2.3654,
1426
  "step": 200
1427
  },
1428
  {
1429
  "epoch": 0.4780023781212842,
1430
- "grad_norm": 2.20401718688023,
1431
  "learning_rate": 1.9528166655734267e-05,
1432
- "loss": 2.3523,
1433
  "step": 201
1434
  },
1435
  {
1436
  "epoch": 0.4803804994054697,
1437
- "grad_norm": 1.7098249314197853,
1438
  "learning_rate": 1.9522334426907185e-05,
1439
- "loss": 2.0789,
1440
  "step": 202
1441
  },
1442
  {
1443
  "epoch": 0.4827586206896552,
1444
- "grad_norm": 2.893610442346491,
1445
  "learning_rate": 1.951646725546966e-05,
1446
- "loss": 2.2456,
1447
  "step": 203
1448
  },
1449
  {
1450
  "epoch": 0.48513674197384066,
1451
- "grad_norm": 2.2821472751414937,
1452
  "learning_rate": 1.9510565162951538e-05,
1453
- "loss": 2.3351,
1454
  "step": 204
1455
  },
1456
  {
1457
  "epoch": 0.4875148632580262,
1458
- "grad_norm": 1.300863641854685,
1459
  "learning_rate": 1.950462817101079e-05,
1460
- "loss": 2.3617,
1461
  "step": 205
1462
  },
1463
  {
1464
  "epoch": 0.48989298454221164,
1465
- "grad_norm": 1.3899023575579752,
1466
  "learning_rate": 1.9498656301433466e-05,
1467
- "loss": 2.0628,
1468
  "step": 206
1469
  },
1470
  {
1471
  "epoch": 0.49227110582639716,
1472
- "grad_norm": 1.018395556873715,
1473
  "learning_rate": 1.9492649576133594e-05,
1474
- "loss": 2.2362,
1475
  "step": 207
1476
  },
1477
  {
1478
  "epoch": 0.4946492271105826,
1479
- "grad_norm": 1.4600451736607643,
1480
  "learning_rate": 1.94866080171531e-05,
1481
- "loss": 2.2194,
1482
  "step": 208
1483
  },
1484
  {
1485
  "epoch": 0.49702734839476814,
1486
- "grad_norm": 4.244045726741938,
1487
  "learning_rate": 1.9480531646661753e-05,
1488
- "loss": 2.434,
1489
  "step": 209
1490
  },
1491
  {
1492
  "epoch": 0.4994054696789536,
1493
- "grad_norm": 1.2125371460079588,
1494
  "learning_rate": 1.9474420486957045e-05,
1495
- "loss": 2.228,
1496
  "step": 210
1497
  },
1498
  {
1499
  "epoch": 0.4994054696789536,
1500
- "eval_loss": 2.4567697048187256,
1501
- "eval_runtime": 151.5129,
1502
- "eval_samples_per_second": 4.217,
1503
- "eval_steps_per_second": 0.528,
1504
  "step": 210
1505
  }
1506
  ],
@@ -1521,7 +1521,7 @@
1521
  "attributes": {}
1522
  }
1523
  },
1524
- "total_flos": 1.3027053762772992e+17,
1525
  "train_batch_size": 1,
1526
  "trial_name": null,
1527
  "trial_params": null
 
11
  "log_history": [
12
  {
13
  "epoch": 0.0023781212841854932,
14
+ "grad_norm": 32.74397118558861,
15
  "learning_rate": 5.000000000000001e-07,
16
+ "loss": 2.9478,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.0023781212841854932,
21
+ "eval_loss": 3.373392343521118,
22
+ "eval_runtime": 78.9756,
23
+ "eval_samples_per_second": 8.091,
24
+ "eval_steps_per_second": 1.013,
25
  "step": 1
26
  },
27
  {
28
  "epoch": 0.0047562425683709865,
29
+ "grad_norm": 37.080911592721954,
30
  "learning_rate": 1.0000000000000002e-06,
31
+ "loss": 2.8264,
32
  "step": 2
33
  },
34
  {
35
  "epoch": 0.007134363852556481,
36
+ "grad_norm": 31.107267997702266,
37
  "learning_rate": 1.5e-06,
38
  "loss": 2.9936,
39
  "step": 3
40
  },
41
  {
42
  "epoch": 0.009512485136741973,
43
+ "grad_norm": 27.574905774161167,
44
  "learning_rate": 2.0000000000000003e-06,
45
+ "loss": 2.8944,
46
  "step": 4
47
  },
48
  {
49
  "epoch": 0.011890606420927468,
50
+ "grad_norm": 13.643957484299273,
51
  "learning_rate": 2.5e-06,
52
+ "loss": 2.7687,
53
  "step": 5
54
  },
55
  {
56
  "epoch": 0.014268727705112961,
57
+ "grad_norm": 15.077028980653411,
58
  "learning_rate": 3e-06,
59
+ "loss": 2.6623,
60
  "step": 6
61
  },
62
  {
63
  "epoch": 0.016646848989298454,
64
+ "grad_norm": 14.569557474559408,
65
  "learning_rate": 3.5e-06,
66
+ "loss": 2.9007,
67
  "step": 7
68
  },
69
  {
70
  "epoch": 0.019024970273483946,
71
+ "grad_norm": 13.894984550517007,
72
  "learning_rate": 4.000000000000001e-06,
73
+ "loss": 2.8359,
74
  "step": 8
75
  },
76
  {
77
  "epoch": 0.02140309155766944,
78
+ "grad_norm": 13.546442233583257,
79
  "learning_rate": 4.5e-06,
80
+ "loss": 2.7583,
81
  "step": 9
82
  },
83
  {
84
  "epoch": 0.023781212841854936,
85
+ "grad_norm": 11.166476039936938,
86
  "learning_rate": 5e-06,
87
+ "loss": 2.518,
88
  "step": 10
89
  },
90
  {
91
  "epoch": 0.026159334126040427,
92
+ "grad_norm": 7.787448179397784,
93
  "learning_rate": 5.500000000000001e-06,
94
+ "loss": 2.6494,
95
  "step": 11
96
  },
97
  {
98
  "epoch": 0.028537455410225922,
99
+ "grad_norm": 4.72349822440695,
100
  "learning_rate": 6e-06,
101
+ "loss": 2.4022,
102
  "step": 12
103
  },
104
  {
105
  "epoch": 0.030915576694411414,
106
+ "grad_norm": 4.100722460414476,
107
  "learning_rate": 6.5000000000000004e-06,
108
+ "loss": 2.3933,
109
  "step": 13
110
  },
111
  {
112
  "epoch": 0.03329369797859691,
113
+ "grad_norm": 4.193151112965372,
114
  "learning_rate": 7e-06,
115
+ "loss": 2.5468,
116
  "step": 14
117
  },
118
  {
119
  "epoch": 0.0356718192627824,
120
+ "grad_norm": 5.502246954578136,
121
  "learning_rate": 7.500000000000001e-06,
122
+ "loss": 2.5126,
123
  "step": 15
124
  },
125
  {
126
  "epoch": 0.03804994054696789,
127
+ "grad_norm": 5.716937946349337,
128
  "learning_rate": 8.000000000000001e-06,
129
+ "loss": 2.6761,
130
  "step": 16
131
  },
132
  {
133
  "epoch": 0.04042806183115339,
134
+ "grad_norm": 3.008867017303434,
135
  "learning_rate": 8.5e-06,
136
+ "loss": 2.4264,
137
  "step": 17
138
  },
139
  {
140
  "epoch": 0.04280618311533888,
141
+ "grad_norm": 3.6016120293217178,
142
  "learning_rate": 9e-06,
143
+ "loss": 2.3836,
144
  "step": 18
145
  },
146
  {
147
  "epoch": 0.04518430439952437,
148
+ "grad_norm": 2.1431386543975908,
149
  "learning_rate": 9.5e-06,
150
+ "loss": 2.4879,
151
  "step": 19
152
  },
153
  {
154
  "epoch": 0.04756242568370987,
155
+ "grad_norm": 3.838539096237921,
156
  "learning_rate": 1e-05,
157
+ "loss": 2.4116,
158
  "step": 20
159
  },
160
  {
161
  "epoch": 0.04994054696789536,
162
+ "grad_norm": 1.9119200890173822,
163
  "learning_rate": 1.0500000000000001e-05,
164
+ "loss": 2.6716,
165
  "step": 21
166
  },
167
  {
168
  "epoch": 0.052318668252080855,
169
+ "grad_norm": 2.0853737807318904,
170
  "learning_rate": 1.1000000000000001e-05,
171
+ "loss": 2.4523,
172
  "step": 22
173
  },
174
  {
175
  "epoch": 0.054696789536266346,
176
+ "grad_norm": 1.6109539551135528,
177
  "learning_rate": 1.15e-05,
178
+ "loss": 2.3086,
179
  "step": 23
180
  },
181
  {
182
  "epoch": 0.057074910820451845,
183
+ "grad_norm": 2.575933824126331,
184
  "learning_rate": 1.2e-05,
185
+ "loss": 2.5576,
186
  "step": 24
187
  },
188
  {
189
  "epoch": 0.059453032104637336,
190
+ "grad_norm": 1.4827660467902501,
191
  "learning_rate": 1.25e-05,
192
+ "loss": 2.264,
193
  "step": 25
194
  },
195
  {
196
  "epoch": 0.06183115338882283,
197
+ "grad_norm": 2.1022593224903128,
198
  "learning_rate": 1.3000000000000001e-05,
199
+ "loss": 2.8781,
200
  "step": 26
201
  },
202
  {
203
  "epoch": 0.06420927467300833,
204
+ "grad_norm": 2.756765764204405,
205
  "learning_rate": 1.3500000000000001e-05,
206
+ "loss": 2.5071,
207
  "step": 27
208
  },
209
  {
210
  "epoch": 0.06658739595719382,
211
+ "grad_norm": 1.6249455774563617,
212
  "learning_rate": 1.4e-05,
213
+ "loss": 2.5085,
214
  "step": 28
215
  },
216
  {
217
  "epoch": 0.06896551724137931,
218
+ "grad_norm": 1.8653671914008603,
219
  "learning_rate": 1.45e-05,
220
+ "loss": 2.4368,
221
  "step": 29
222
  },
223
  {
224
  "epoch": 0.0713436385255648,
225
+ "grad_norm": 1.867632087572305,
226
  "learning_rate": 1.5000000000000002e-05,
227
+ "loss": 2.1204,
228
  "step": 30
229
  },
230
  {
231
  "epoch": 0.07372175980975029,
232
+ "grad_norm": 1.2367329590346625,
233
  "learning_rate": 1.55e-05,
234
+ "loss": 2.1118,
235
  "step": 31
236
  },
237
  {
238
  "epoch": 0.07609988109393578,
239
+ "grad_norm": 1.5969207000997636,
240
  "learning_rate": 1.6000000000000003e-05,
241
+ "loss": 2.4747,
242
  "step": 32
243
  },
244
  {
245
  "epoch": 0.07847800237812129,
246
+ "grad_norm": 1.5075133034220278,
247
  "learning_rate": 1.65e-05,
248
+ "loss": 2.3349,
249
  "step": 33
250
  },
251
  {
252
  "epoch": 0.08085612366230678,
253
+ "grad_norm": 1.4803692479229955,
254
  "learning_rate": 1.7e-05,
255
+ "loss": 2.3915,
256
  "step": 34
257
  },
258
  {
259
  "epoch": 0.08323424494649227,
260
+ "grad_norm": 2.692949771759104,
261
  "learning_rate": 1.7500000000000002e-05,
262
+ "loss": 2.6397,
263
  "step": 35
264
  },
265
  {
266
  "epoch": 0.08561236623067776,
267
+ "grad_norm": 1.3636512770329847,
268
  "learning_rate": 1.8e-05,
269
+ "loss": 2.2944,
270
  "step": 36
271
  },
272
  {
273
  "epoch": 0.08799048751486326,
274
+ "grad_norm": 1.8310086696195464,
275
  "learning_rate": 1.8500000000000002e-05,
276
+ "loss": 2.2614,
277
  "step": 37
278
  },
279
  {
280
  "epoch": 0.09036860879904875,
281
+ "grad_norm": 1.7765027708264853,
282
  "learning_rate": 1.9e-05,
283
+ "loss": 2.3579,
284
  "step": 38
285
  },
286
  {
287
  "epoch": 0.09274673008323424,
288
+ "grad_norm": 1.4484769960901491,
289
  "learning_rate": 1.95e-05,
290
+ "loss": 2.4548,
291
  "step": 39
292
  },
293
  {
294
  "epoch": 0.09512485136741974,
295
+ "grad_norm": 3.1520205275209414,
296
  "learning_rate": 2e-05,
297
+ "loss": 2.5208,
298
  "step": 40
299
  },
300
  {
301
  "epoch": 0.09750297265160524,
302
+ "grad_norm": 1.5897739849482102,
303
  "learning_rate": 1.9999981652287733e-05,
304
+ "loss": 2.4216,
305
  "step": 41
306
  },
307
  {
308
  "epoch": 0.09988109393579073,
309
+ "grad_norm": 2.4520591326987495,
310
  "learning_rate": 1.999992660921826e-05,
311
+ "loss": 2.2326,
312
  "step": 42
313
  },
314
  {
315
  "epoch": 0.10225921521997622,
316
+ "grad_norm": 1.4817926319377914,
317
  "learning_rate": 1.999983487099356e-05,
318
+ "loss": 2.541,
319
  "step": 43
320
  },
321
  {
322
  "epoch": 0.10463733650416171,
323
+ "grad_norm": 2.4022755616863956,
324
  "learning_rate": 1.999970643795027e-05,
325
+ "loss": 2.3645,
326
  "step": 44
327
  },
328
  {
329
  "epoch": 0.1070154577883472,
330
+ "grad_norm": 2.476073007712477,
331
  "learning_rate": 1.9999541310559686e-05,
332
+ "loss": 2.5051,
333
  "step": 45
334
  },
335
  {
336
  "epoch": 0.10939357907253269,
337
+ "grad_norm": 3.428213096316913,
338
  "learning_rate": 1.9999339489427746e-05,
339
+ "loss": 2.3605,
340
  "step": 46
341
  },
342
  {
343
  "epoch": 0.1117717003567182,
344
+ "grad_norm": 2.464783346708793,
345
  "learning_rate": 1.9999100975295046e-05,
346
+ "loss": 2.3785,
347
  "step": 47
348
  },
349
  {
350
  "epoch": 0.11414982164090369,
351
+ "grad_norm": 2.0686444585541754,
352
  "learning_rate": 1.999882576903682e-05,
353
+ "loss": 2.6035,
354
  "step": 48
355
  },
356
  {
357
  "epoch": 0.11652794292508918,
358
+ "grad_norm": 2.30832669621963,
359
  "learning_rate": 1.9998513871662945e-05,
360
+ "loss": 2.3982,
361
  "step": 49
362
  },
363
  {
364
  "epoch": 0.11890606420927467,
365
+ "grad_norm": 1.7443884357427357,
366
  "learning_rate": 1.9998165284317944e-05,
367
+ "loss": 2.2344,
368
  "step": 50
369
  },
370
  {
371
  "epoch": 0.12128418549346016,
372
+ "grad_norm": 1.862498116501275,
373
  "learning_rate": 1.999778000828098e-05,
374
+ "loss": 2.0631,
375
  "step": 51
376
  },
377
  {
378
  "epoch": 0.12366230677764566,
379
+ "grad_norm": 1.3054707043181313,
380
  "learning_rate": 1.9997358044965833e-05,
381
+ "loss": 2.4434,
382
  "step": 52
383
  },
384
  {
385
  "epoch": 0.12604042806183116,
386
+ "grad_norm": 2.6645481874919583,
387
  "learning_rate": 1.9996899395920915e-05,
388
+ "loss": 2.4647,
389
  "step": 53
390
  },
391
  {
392
  "epoch": 0.12841854934601665,
393
+ "grad_norm": 1.6884769054479885,
394
  "learning_rate": 1.999640406282926e-05,
395
+ "loss": 2.2864,
396
  "step": 54
397
  },
398
  {
399
  "epoch": 0.13079667063020214,
400
+ "grad_norm": 2.281038989939936,
401
  "learning_rate": 1.9995872047508516e-05,
402
+ "loss": 2.2999,
403
  "step": 55
404
  },
405
  {
406
  "epoch": 0.13317479191438764,
407
+ "grad_norm": 1.8457967068582515,
408
  "learning_rate": 1.9995303351910934e-05,
409
+ "loss": 2.5117,
410
  "step": 56
411
  },
412
  {
413
  "epoch": 0.13555291319857313,
414
+ "grad_norm": 1.593901860141829,
415
  "learning_rate": 1.9994697978123363e-05,
416
+ "loss": 2.1696,
417
  "step": 57
418
  },
419
  {
420
  "epoch": 0.13793103448275862,
421
+ "grad_norm": 2.168613470386471,
422
  "learning_rate": 1.9994055928367256e-05,
423
+ "loss": 2.6852,
424
  "step": 58
425
  },
426
  {
427
  "epoch": 0.1403091557669441,
428
+ "grad_norm": 1.5398971278043392,
429
  "learning_rate": 1.999337720499863e-05,
430
+ "loss": 2.396,
431
  "step": 59
432
  },
433
  {
434
  "epoch": 0.1426872770511296,
435
+ "grad_norm": 1.7618365641290346,
436
  "learning_rate": 1.99926618105081e-05,
437
+ "loss": 2.047,
438
  "step": 60
439
  },
440
  {
441
  "epoch": 0.1450653983353151,
442
+ "grad_norm": 1.3935889380673343,
443
  "learning_rate": 1.9991909747520835e-05,
444
+ "loss": 2.1071,
445
  "step": 61
446
  },
447
  {
448
  "epoch": 0.14744351961950058,
449
+ "grad_norm": 1.3063537354837544,
450
  "learning_rate": 1.999112101879656e-05,
451
+ "loss": 2.3992,
452
  "step": 62
453
  },
454
  {
455
  "epoch": 0.14982164090368608,
456
+ "grad_norm": 1.025531260937785,
457
  "learning_rate": 1.9990295627229544e-05,
458
+ "loss": 2.4764,
459
  "step": 63
460
  },
461
  {
462
  "epoch": 0.15219976218787157,
463
+ "grad_norm": 1.0662434476421614,
464
  "learning_rate": 1.99894335758486e-05,
465
+ "loss": 2.0608,
466
  "step": 64
467
  },
468
  {
469
  "epoch": 0.1545778834720571,
470
+ "grad_norm": 1.335312659171346,
471
  "learning_rate": 1.9988534867817065e-05,
472
+ "loss": 2.0345,
473
  "step": 65
474
  },
475
  {
476
  "epoch": 0.15695600475624258,
477
+ "grad_norm": 1.7866360477276542,
478
  "learning_rate": 1.9987599506432785e-05,
479
+ "loss": 2.4781,
480
  "step": 66
481
  },
482
  {
483
  "epoch": 0.15933412604042807,
484
+ "grad_norm": 1.3661388854405736,
485
  "learning_rate": 1.9986627495128105e-05,
486
+ "loss": 2.3895,
487
  "step": 67
488
  },
489
  {
490
  "epoch": 0.16171224732461356,
491
+ "grad_norm": 1.69413799763372,
492
  "learning_rate": 1.9985618837469864e-05,
493
+ "loss": 2.415,
494
  "step": 68
495
  },
496
  {
497
  "epoch": 0.16409036860879905,
498
+ "grad_norm": 1.6458921632697698,
499
  "learning_rate": 1.998457353715938e-05,
500
+ "loss": 2.4017,
501
  "step": 69
502
  },
503
  {
504
  "epoch": 0.16646848989298454,
505
+ "grad_norm": 1.705150979387567,
506
  "learning_rate": 1.998349159803241e-05,
507
+ "loss": 2.317,
508
  "step": 70
509
  },
510
  {
511
  "epoch": 0.16884661117717004,
512
+ "grad_norm": 1.4786622132550975,
513
  "learning_rate": 1.9982373024059195e-05,
514
+ "loss": 2.4046,
515
  "step": 71
516
  },
517
  {
518
  "epoch": 0.17122473246135553,
519
+ "grad_norm": 1.1257378401253821,
520
  "learning_rate": 1.998121781934438e-05,
521
+ "loss": 2.196,
522
  "step": 72
523
  },
524
  {
525
  "epoch": 0.17360285374554102,
526
+ "grad_norm": 1.2450243917414825,
527
  "learning_rate": 1.9980025988127037e-05,
528
+ "loss": 2.1235,
529
  "step": 73
530
  },
531
  {
532
  "epoch": 0.1759809750297265,
533
+ "grad_norm": 1.2574977788035384,
534
  "learning_rate": 1.9978797534780646e-05,
535
+ "loss": 2.4466,
536
  "step": 74
537
  },
538
  {
539
  "epoch": 0.178359096313912,
540
+ "grad_norm": 1.390309850165232,
541
  "learning_rate": 1.9977532463813064e-05,
542
+ "loss": 2.4469,
543
  "step": 75
544
  },
545
  {
546
  "epoch": 0.1807372175980975,
547
+ "grad_norm": 1.2146735833909619,
548
  "learning_rate": 1.9976230779866527e-05,
549
+ "loss": 2.3705,
550
  "step": 76
551
  },
552
  {
553
  "epoch": 0.18311533888228299,
554
+ "grad_norm": 1.26356031715395,
555
  "learning_rate": 1.9974892487717613e-05,
556
+ "loss": 2.4926,
557
  "step": 77
558
  },
559
  {
560
  "epoch": 0.18549346016646848,
561
+ "grad_norm": 1.3934309027656608,
562
  "learning_rate": 1.997351759227725e-05,
563
+ "loss": 2.47,
564
  "step": 78
565
  },
566
  {
567
  "epoch": 0.187871581450654,
568
+ "grad_norm": 1.129998342751621,
569
  "learning_rate": 1.9972106098590665e-05,
570
+ "loss": 2.3718,
571
  "step": 79
572
  },
573
  {
574
  "epoch": 0.1902497027348395,
575
+ "grad_norm": 1.3957720808228478,
576
  "learning_rate": 1.9970658011837404e-05,
577
+ "loss": 2.2057,
578
  "step": 80
579
  },
580
  {
581
  "epoch": 0.19262782401902498,
582
+ "grad_norm": 1.5835508346410572,
583
  "learning_rate": 1.9969173337331283e-05,
584
+ "loss": 2.2551,
585
  "step": 81
586
  },
587
  {
588
  "epoch": 0.19500594530321047,
589
+ "grad_norm": 1.0726363834452401,
590
  "learning_rate": 1.996765208052037e-05,
591
+ "loss": 1.9962,
592
  "step": 82
593
  },
594
  {
595
  "epoch": 0.19738406658739596,
596
+ "grad_norm": 1.5380248440103288,
597
  "learning_rate": 1.9966094246986983e-05,
598
+ "loss": 2.1986,
599
  "step": 83
600
  },
601
  {
602
  "epoch": 0.19976218787158145,
603
+ "grad_norm": 1.2953387209833067,
604
  "learning_rate": 1.9964499842447665e-05,
605
+ "loss": 2.5842,
606
  "step": 84
607
  },
608
  {
609
  "epoch": 0.20214030915576695,
610
+ "grad_norm": 1.0712033116668103,
611
  "learning_rate": 1.9962868872753144e-05,
612
+ "loss": 2.1298,
613
  "step": 85
614
  },
615
  {
616
  "epoch": 0.20451843043995244,
617
+ "grad_norm": 1.095761902776689,
618
  "learning_rate": 1.996120134388834e-05,
619
+ "loss": 2.3641,
620
  "step": 86
621
  },
622
  {
623
  "epoch": 0.20689655172413793,
624
+ "grad_norm": 1.186679631328553,
625
  "learning_rate": 1.995949726197231e-05,
626
+ "loss": 2.4801,
627
  "step": 87
628
  },
629
  {
630
  "epoch": 0.20927467300832342,
631
+ "grad_norm": 1.1887498108170933,
632
  "learning_rate": 1.9957756633258264e-05,
633
+ "loss": 2.2866,
634
  "step": 88
635
  },
636
  {
637
  "epoch": 0.2116527942925089,
638
+ "grad_norm": 1.0909023774872124,
639
  "learning_rate": 1.9955979464133515e-05,
640
+ "loss": 2.2916,
641
  "step": 89
642
  },
643
  {
644
  "epoch": 0.2140309155766944,
645
+ "grad_norm": 1.1077175360558418,
646
  "learning_rate": 1.995416576111945e-05,
647
+ "loss": 2.1077,
648
  "step": 90
649
  },
650
  {
651
  "epoch": 0.2164090368608799,
652
+ "grad_norm": 1.1526064687436712,
653
  "learning_rate": 1.9952315530871537e-05,
654
+ "loss": 2.2723,
655
  "step": 91
656
  },
657
  {
658
  "epoch": 0.21878715814506539,
659
+ "grad_norm": 1.1818210038912647,
660
  "learning_rate": 1.9950428780179274e-05,
661
+ "loss": 2.2338,
662
  "step": 92
663
  },
664
  {
665
  "epoch": 0.2211652794292509,
666
+ "grad_norm": 1.164942154271255,
667
  "learning_rate": 1.994850551596617e-05,
668
+ "loss": 2.3817,
669
  "step": 93
670
  },
671
  {
672
  "epoch": 0.2235434007134364,
673
+ "grad_norm": 1.339398993177121,
674
  "learning_rate": 1.9946545745289727e-05,
675
+ "loss": 2.5508,
676
  "step": 94
677
  },
678
  {
679
  "epoch": 0.2259215219976219,
680
+ "grad_norm": 1.3267763052855093,
681
  "learning_rate": 1.9944549475341404e-05,
682
+ "loss": 2.247,
683
  "step": 95
684
  },
685
  {
686
  "epoch": 0.22829964328180738,
687
+ "grad_norm": 1.1953250811556597,
688
  "learning_rate": 1.99425167134466e-05,
689
+ "loss": 2.3373,
690
  "step": 96
691
  },
692
  {
693
  "epoch": 0.23067776456599287,
694
+ "grad_norm": 1.4321452409301854,
695
  "learning_rate": 1.9940447467064624e-05,
696
+ "loss": 2.4776,
697
  "step": 97
698
  },
699
  {
700
  "epoch": 0.23305588585017836,
701
+ "grad_norm": 1.0224444212683161,
702
  "learning_rate": 1.9938341743788658e-05,
703
+ "loss": 2.1837,
704
  "step": 98
705
  },
706
  {
707
  "epoch": 0.23543400713436385,
708
+ "grad_norm": 1.0977574950238398,
709
  "learning_rate": 1.9936199551345744e-05,
710
+ "loss": 2.1478,
711
  "step": 99
712
  },
713
  {
714
  "epoch": 0.23781212841854935,
715
+ "grad_norm": 1.0660069054078747,
716
  "learning_rate": 1.9934020897596752e-05,
717
+ "loss": 2.2816,
718
  "step": 100
719
  },
720
  {
721
  "epoch": 0.24019024970273484,
722
+ "grad_norm": 1.0312993672336248,
723
  "learning_rate": 1.9931805790536342e-05,
724
+ "loss": 2.2468,
725
  "step": 101
726
  },
727
  {
728
  "epoch": 0.24256837098692033,
729
+ "grad_norm": 1.1278898252066067,
730
  "learning_rate": 1.9929554238292944e-05,
731
+ "loss": 2.0526,
732
  "step": 102
733
  },
734
  {
735
  "epoch": 0.24494649227110582,
736
+ "grad_norm": 1.288343002944789,
737
  "learning_rate": 1.992726624912872e-05,
738
+ "loss": 2.512,
739
  "step": 103
740
  },
741
  {
742
  "epoch": 0.2473246135552913,
743
+ "grad_norm": 1.1840020386119305,
744
  "learning_rate": 1.992494183143955e-05,
745
+ "loss": 2.6356,
746
  "step": 104
747
  },
748
  {
749
  "epoch": 0.2497027348394768,
750
+ "grad_norm": 1.1014678408276726,
751
  "learning_rate": 1.9922580993754985e-05,
752
+ "loss": 2.3521,
753
  "step": 105
754
  },
755
  {
756
  "epoch": 0.2497027348394768,
757
+ "eval_loss": 2.5365779399871826,
758
+ "eval_runtime": 66.0796,
759
+ "eval_samples_per_second": 9.67,
760
+ "eval_steps_per_second": 1.211,
761
  "step": 105
762
  },
763
  {
764
  "epoch": 0.2520808561236623,
765
+ "grad_norm": 0.9957844757920508,
766
  "learning_rate": 1.9920183744738208e-05,
767
+ "loss": 2.355,
768
  "step": 106
769
  },
770
  {
771
  "epoch": 0.2544589774078478,
772
+ "grad_norm": 1.0070598447313825,
773
  "learning_rate": 1.9917750093186036e-05,
774
+ "loss": 2.1747,
775
  "step": 107
776
  },
777
  {
778
  "epoch": 0.2568370986920333,
779
+ "grad_norm": 1.2463453868295562,
780
  "learning_rate": 1.9915280048028853e-05,
781
+ "loss": 2.4131,
782
  "step": 108
783
  },
784
  {
785
  "epoch": 0.25921521997621877,
786
+ "grad_norm": 1.158558292534161,
787
  "learning_rate": 1.9912773618330595e-05,
788
+ "loss": 2.4527,
789
  "step": 109
790
  },
791
  {
792
  "epoch": 0.2615933412604043,
793
+ "grad_norm": 1.1875643459332377,
794
  "learning_rate": 1.9910230813288713e-05,
795
+ "loss": 2.1523,
796
  "step": 110
797
  },
798
  {
799
  "epoch": 0.26397146254458975,
800
+ "grad_norm": 0.892269173897758,
801
  "learning_rate": 1.9907651642234138e-05,
802
+ "loss": 1.9606,
803
  "step": 111
804
  },
805
  {
806
  "epoch": 0.26634958382877527,
807
+ "grad_norm": 1.181952902180908,
808
  "learning_rate": 1.9905036114631247e-05,
809
+ "loss": 2.3201,
810
  "step": 112
811
  },
812
  {
813
  "epoch": 0.26872770511296074,
814
+ "grad_norm": 0.9689153704257877,
815
  "learning_rate": 1.990238424007783e-05,
816
+ "loss": 2.2329,
817
  "step": 113
818
  },
819
  {
820
  "epoch": 0.27110582639714625,
821
+ "grad_norm": 1.3665918769424286,
822
  "learning_rate": 1.989969602830505e-05,
823
+ "loss": 2.2387,
824
  "step": 114
825
  },
826
  {
827
  "epoch": 0.2734839476813318,
828
+ "grad_norm": 1.0478434719151144,
829
  "learning_rate": 1.9896971489177417e-05,
830
+ "loss": 2.2798,
831
  "step": 115
832
  },
833
  {
834
  "epoch": 0.27586206896551724,
835
+ "grad_norm": 1.5752154316391798,
836
  "learning_rate": 1.9894210632692745e-05,
837
+ "loss": 2.201,
838
  "step": 116
839
  },
840
  {
841
  "epoch": 0.27824019024970276,
842
+ "grad_norm": 1.0264277011384757,
843
  "learning_rate": 1.9891413468982112e-05,
844
+ "loss": 2.2756,
845
  "step": 117
846
  },
847
  {
848
  "epoch": 0.2806183115338882,
849
+ "grad_norm": 1.1750703393359614,
850
  "learning_rate": 1.988858000830983e-05,
851
+ "loss": 2.1907,
852
  "step": 118
853
  },
854
  {
855
  "epoch": 0.28299643281807374,
856
+ "grad_norm": 0.9456957190962577,
857
  "learning_rate": 1.9885710261073402e-05,
858
+ "loss": 2.2993,
859
  "step": 119
860
  },
861
  {
862
  "epoch": 0.2853745541022592,
863
+ "grad_norm": 1.37591692336223,
864
  "learning_rate": 1.9882804237803487e-05,
865
+ "loss": 2.0751,
866
  "step": 120
867
  },
868
  {
869
  "epoch": 0.2877526753864447,
870
+ "grad_norm": 0.991181263305241,
871
  "learning_rate": 1.9879861949163863e-05,
872
+ "loss": 2.1946,
873
  "step": 121
874
  },
875
  {
876
  "epoch": 0.2901307966706302,
877
+ "grad_norm": 1.2826616603092615,
878
  "learning_rate": 1.9876883405951378e-05,
879
+ "loss": 2.3084,
880
  "step": 122
881
  },
882
  {
883
  "epoch": 0.2925089179548157,
884
+ "grad_norm": 1.3162982027829009,
885
  "learning_rate": 1.987386861909593e-05,
886
+ "loss": 2.294,
887
  "step": 123
888
  },
889
  {
890
  "epoch": 0.29488703923900117,
891
+ "grad_norm": 1.086311999313279,
892
  "learning_rate": 1.98708175996604e-05,
893
+ "loss": 2.3025,
894
  "step": 124
895
  },
896
  {
897
  "epoch": 0.2972651605231867,
898
+ "grad_norm": 1.10683170372015,
899
  "learning_rate": 1.986773035884064e-05,
900
+ "loss": 2.3447,
901
  "step": 125
902
  },
903
  {
904
  "epoch": 0.29964328180737215,
905
+ "grad_norm": 1.090568761480393,
906
  "learning_rate": 1.9864606907965407e-05,
907
+ "loss": 2.4104,
908
  "step": 126
909
  },
910
  {
911
  "epoch": 0.30202140309155767,
912
+ "grad_norm": 1.4024759238343605,
913
  "learning_rate": 1.986144725849634e-05,
914
+ "loss": 2.298,
915
  "step": 127
916
  },
917
  {
918
  "epoch": 0.30439952437574314,
919
+ "grad_norm": 0.9324914520062791,
920
  "learning_rate": 1.9858251422027903e-05,
921
+ "loss": 2.1123,
922
  "step": 128
923
  },
924
  {
925
  "epoch": 0.30677764565992865,
926
+ "grad_norm": 1.3818136151492852,
927
  "learning_rate": 1.9855019410287355e-05,
928
+ "loss": 2.2786,
929
  "step": 129
930
  },
931
  {
932
  "epoch": 0.3091557669441142,
933
+ "grad_norm": 0.9879756737720099,
934
  "learning_rate": 1.98517512351347e-05,
935
+ "loss": 2.2735,
936
  "step": 130
937
  },
938
  {
939
  "epoch": 0.31153388822829964,
940
+ "grad_norm": 1.4107106057474024,
941
  "learning_rate": 1.9848446908562647e-05,
942
+ "loss": 2.2421,
943
  "step": 131
944
  },
945
  {
946
  "epoch": 0.31391200951248516,
947
+ "grad_norm": 0.978862094447652,
948
  "learning_rate": 1.9845106442696563e-05,
949
+ "loss": 2.4152,
950
  "step": 132
951
  },
952
  {
953
  "epoch": 0.3162901307966706,
954
+ "grad_norm": 1.3714074038447606,
955
  "learning_rate": 1.9841729849794427e-05,
956
+ "loss": 2.4567,
957
  "step": 133
958
  },
959
  {
960
  "epoch": 0.31866825208085614,
961
+ "grad_norm": 1.030641093673837,
962
  "learning_rate": 1.983831714224679e-05,
963
+ "loss": 2.3015,
964
  "step": 134
965
  },
966
  {
967
  "epoch": 0.3210463733650416,
968
+ "grad_norm": 1.1744699755999302,
969
  "learning_rate": 1.9834868332576727e-05,
970
+ "loss": 2.2878,
971
  "step": 135
972
  },
973
  {
974
  "epoch": 0.3234244946492271,
975
+ "grad_norm": 0.9733999816490441,
976
  "learning_rate": 1.9831383433439798e-05,
977
+ "loss": 2.1571,
978
  "step": 136
979
  },
980
  {
981
  "epoch": 0.3258026159334126,
982
+ "grad_norm": 1.0470367999253474,
983
  "learning_rate": 1.982786245762398e-05,
984
+ "loss": 2.0943,
985
  "step": 137
986
  },
987
  {
988
  "epoch": 0.3281807372175981,
989
+ "grad_norm": 1.0748276455064096,
990
  "learning_rate": 1.9824305418049645e-05,
991
+ "loss": 2.4156,
992
  "step": 138
993
  },
994
  {
995
  "epoch": 0.33055885850178357,
996
+ "grad_norm": 1.0220509349947084,
997
  "learning_rate": 1.9820712327769503e-05,
998
+ "loss": 2.1898,
999
  "step": 139
1000
  },
1001
  {
1002
  "epoch": 0.3329369797859691,
1003
+ "grad_norm": 0.9811166423920332,
1004
  "learning_rate": 1.9817083199968552e-05,
1005
+ "loss": 2.3449,
1006
  "step": 140
1007
  },
1008
  {
1009
  "epoch": 0.33531510107015455,
1010
+ "grad_norm": 1.0664757695722766,
1011
  "learning_rate": 1.9813418047964025e-05,
1012
+ "loss": 2.1514,
1013
  "step": 141
1014
  },
1015
  {
1016
  "epoch": 0.3376932223543401,
1017
+ "grad_norm": 1.1228830278366924,
1018
  "learning_rate": 1.9809716885205363e-05,
1019
+ "loss": 2.3371,
1020
  "step": 142
1021
  },
1022
  {
1023
  "epoch": 0.3400713436385256,
1024
+ "grad_norm": 1.0703957613617774,
1025
  "learning_rate": 1.980597972527413e-05,
1026
+ "loss": 2.2577,
1027
  "step": 143
1028
  },
1029
  {
1030
  "epoch": 0.34244946492271106,
1031
+ "grad_norm": 0.9971842999532138,
1032
  "learning_rate": 1.9802206581883992e-05,
1033
+ "loss": 2.2048,
1034
  "step": 144
1035
  },
1036
  {
1037
  "epoch": 0.3448275862068966,
1038
+ "grad_norm": 0.9969712850303254,
1039
  "learning_rate": 1.979839746888067e-05,
1040
+ "loss": 2.1725,
1041
  "step": 145
1042
  },
1043
  {
1044
  "epoch": 0.34720570749108204,
1045
+ "grad_norm": 0.9782490093980141,
1046
  "learning_rate": 1.979455240024186e-05,
1047
+ "loss": 2.1598,
1048
  "step": 146
1049
  },
1050
  {
1051
  "epoch": 0.34958382877526756,
1052
+ "grad_norm": 1.1595035293528873,
1053
  "learning_rate": 1.97906713900772e-05,
1054
+ "loss": 2.1812,
1055
  "step": 147
1056
  },
1057
  {
1058
  "epoch": 0.351961950059453,
1059
+ "grad_norm": 1.0488323565717943,
1060
  "learning_rate": 1.9786754452628226e-05,
1061
+ "loss": 2.126,
1062
  "step": 148
1063
  },
1064
  {
1065
  "epoch": 0.35434007134363854,
1066
+ "grad_norm": 1.0236205683546673,
1067
  "learning_rate": 1.9782801602268306e-05,
1068
+ "loss": 1.9399,
1069
  "step": 149
1070
  },
1071
  {
1072
  "epoch": 0.356718192627824,
1073
+ "grad_norm": 0.983049547537296,
1074
  "learning_rate": 1.9778812853502592e-05,
1075
+ "loss": 2.0336,
1076
  "step": 150
1077
  },
1078
  {
1079
  "epoch": 0.3590963139120095,
1080
+ "grad_norm": 1.0856474713800959,
1081
  "learning_rate": 1.9774788220967968e-05,
1082
+ "loss": 2.2103,
1083
  "step": 151
1084
  },
1085
  {
1086
  "epoch": 0.361474435196195,
1087
+ "grad_norm": 1.098143269144179,
1088
  "learning_rate": 1.9770727719432994e-05,
1089
+ "loss": 2.1425,
1090
  "step": 152
1091
  },
1092
  {
1093
  "epoch": 0.3638525564803805,
1094
+ "grad_norm": 1.1908904777112574,
1095
  "learning_rate": 1.9766631363797852e-05,
1096
+ "loss": 2.2516,
1097
  "step": 153
1098
  },
1099
  {
1100
  "epoch": 0.36623067776456597,
1101
+ "grad_norm": 1.1823343263781934,
1102
  "learning_rate": 1.9762499169094288e-05,
1103
+ "loss": 2.0991,
1104
  "step": 154
1105
  },
1106
  {
1107
  "epoch": 0.3686087990487515,
1108
+ "grad_norm": 1.1543274307271654,
1109
  "learning_rate": 1.9758331150485576e-05,
1110
+ "loss": 2.2917,
1111
  "step": 155
1112
  },
1113
  {
1114
  "epoch": 0.37098692033293695,
1115
+ "grad_norm": 1.1828452156246019,
1116
  "learning_rate": 1.9754127323266426e-05,
1117
+ "loss": 2.3577,
1118
  "step": 156
1119
  },
1120
  {
1121
  "epoch": 0.3733650416171225,
1122
+ "grad_norm": 1.2458434785978698,
1123
  "learning_rate": 1.9749887702862972e-05,
1124
+ "loss": 2.2291,
1125
  "step": 157
1126
  },
1127
  {
1128
  "epoch": 0.375743162901308,
1129
+ "grad_norm": 1.0632348458757013,
1130
  "learning_rate": 1.9745612304832672e-05,
1131
+ "loss": 2.495,
1132
  "step": 158
1133
  },
1134
  {
1135
  "epoch": 0.37812128418549346,
1136
+ "grad_norm": 1.2413557275846534,
1137
  "learning_rate": 1.9741301144864284e-05,
1138
+ "loss": 2.3006,
1139
  "step": 159
1140
  },
1141
  {
1142
  "epoch": 0.380499405469679,
1143
+ "grad_norm": 1.068837985332943,
1144
  "learning_rate": 1.9736954238777793e-05,
1145
+ "loss": 2.2228,
1146
  "step": 160
1147
  },
1148
  {
1149
  "epoch": 0.38287752675386444,
1150
+ "grad_norm": 1.181973772137545,
1151
  "learning_rate": 1.9732571602524353e-05,
1152
+ "loss": 2.3419,
1153
  "step": 161
1154
  },
1155
  {
1156
  "epoch": 0.38525564803804996,
1157
+ "grad_norm": 0.9361759344356807,
1158
  "learning_rate": 1.972815325218624e-05,
1159
+ "loss": 2.2727,
1160
  "step": 162
1161
  },
1162
  {
1163
  "epoch": 0.3876337693222354,
1164
+ "grad_norm": 1.2300672941710984,
1165
  "learning_rate": 1.9723699203976768e-05,
1166
+ "loss": 2.3947,
1167
  "step": 163
1168
  },
1169
  {
1170
  "epoch": 0.39001189060642094,
1171
+ "grad_norm": 0.9647921025871186,
1172
  "learning_rate": 1.9719209474240263e-05,
1173
+ "loss": 1.8388,
1174
  "step": 164
1175
  },
1176
  {
1177
  "epoch": 0.3923900118906064,
1178
+ "grad_norm": 1.1390311715526416,
1179
  "learning_rate": 1.971468407945198e-05,
1180
+ "loss": 2.4054,
1181
  "step": 165
1182
  },
1183
  {
1184
  "epoch": 0.3947681331747919,
1185
+ "grad_norm": 0.9830051867519547,
1186
  "learning_rate": 1.9710123036218044e-05,
1187
+ "loss": 2.0355,
1188
  "step": 166
1189
  },
1190
  {
1191
  "epoch": 0.3971462544589774,
1192
+ "grad_norm": 1.1244517585073737,
1193
  "learning_rate": 1.97055263612754e-05,
1194
+ "loss": 2.0188,
1195
  "step": 167
1196
  },
1197
  {
1198
  "epoch": 0.3995243757431629,
1199
+ "grad_norm": 1.0256020852263494,
1200
  "learning_rate": 1.9700894071491736e-05,
1201
+ "loss": 2.0774,
1202
  "step": 168
1203
  },
1204
  {
1205
  "epoch": 0.40190249702734837,
1206
+ "grad_norm": 1.011023720252716,
1207
  "learning_rate": 1.9696226183865436e-05,
1208
+ "loss": 2.2592,
1209
  "step": 169
1210
  },
1211
  {
1212
  "epoch": 0.4042806183115339,
1213
+ "grad_norm": 1.046975898884085,
1214
  "learning_rate": 1.969152271552552e-05,
1215
+ "loss": 2.1791,
1216
  "step": 170
1217
  },
1218
  {
1219
  "epoch": 0.40665873959571935,
1220
+ "grad_norm": 1.1800984480399852,
1221
  "learning_rate": 1.9686783683731557e-05,
1222
+ "loss": 2.3941,
1223
  "step": 171
1224
  },
1225
  {
1226
  "epoch": 0.4090368608799049,
1227
+ "grad_norm": 1.2459882622321672,
1228
  "learning_rate": 1.9682009105873633e-05,
1229
+ "loss": 2.1522,
1230
  "step": 172
1231
  },
1232
  {
1233
  "epoch": 0.4114149821640904,
1234
+ "grad_norm": 1.0732133381850257,
1235
  "learning_rate": 1.9677198999472257e-05,
1236
+ "loss": 2.1233,
1237
  "step": 173
1238
  },
1239
  {
1240
  "epoch": 0.41379310344827586,
1241
+ "grad_norm": 1.2405484917580802,
1242
  "learning_rate": 1.967235338217832e-05,
1243
+ "loss": 2.3016,
1244
  "step": 174
1245
  },
1246
  {
1247
  "epoch": 0.4161712247324614,
1248
+ "grad_norm": 1.0759940201219593,
1249
  "learning_rate": 1.9667472271773026e-05,
1250
+ "loss": 2.2947,
1251
  "step": 175
1252
  },
1253
  {
1254
  "epoch": 0.41854934601664684,
1255
+ "grad_norm": 1.2008734320661734,
1256
  "learning_rate": 1.9662555686167808e-05,
1257
+ "loss": 2.2155,
1258
  "step": 176
1259
  },
1260
  {
1261
  "epoch": 0.42092746730083236,
1262
+ "grad_norm": 0.9303619935178572,
1263
  "learning_rate": 1.965760364340429e-05,
1264
+ "loss": 2.1234,
1265
  "step": 177
1266
  },
1267
  {
1268
  "epoch": 0.4233055885850178,
1269
+ "grad_norm": 1.3884826767438652,
1270
  "learning_rate": 1.9652616161654204e-05,
1271
+ "loss": 2.2539,
1272
  "step": 178
1273
  },
1274
  {
1275
  "epoch": 0.42568370986920334,
1276
+ "grad_norm": 0.9947187673832885,
1277
  "learning_rate": 1.9647593259219328e-05,
1278
+ "loss": 2.2052,
1279
  "step": 179
1280
  },
1281
  {
1282
  "epoch": 0.4280618311533888,
1283
+ "grad_norm": 1.4655922792083054,
1284
  "learning_rate": 1.964253495453141e-05,
1285
+ "loss": 2.1552,
1286
  "step": 180
1287
  },
1288
  {
1289
  "epoch": 0.4304399524375743,
1290
+ "grad_norm": 1.1481294188693778,
1291
  "learning_rate": 1.963744126615212e-05,
1292
+ "loss": 2.3942,
1293
  "step": 181
1294
  },
1295
  {
1296
  "epoch": 0.4328180737217598,
1297
+ "grad_norm": 1.239760521409481,
1298
  "learning_rate": 1.9632312212772956e-05,
1299
+ "loss": 2.3091,
1300
  "step": 182
1301
  },
1302
  {
1303
  "epoch": 0.4351961950059453,
1304
+ "grad_norm": 1.0524654460411744,
1305
  "learning_rate": 1.9627147813215207e-05,
1306
+ "loss": 2.302,
1307
  "step": 183
1308
  },
1309
  {
1310
  "epoch": 0.43757431629013077,
1311
+ "grad_norm": 1.0231645108607732,
1312
  "learning_rate": 1.9621948086429847e-05,
1313
+ "loss": 2.2334,
1314
  "step": 184
1315
  },
1316
  {
1317
  "epoch": 0.4399524375743163,
1318
+ "grad_norm": 1.0600582051447691,
1319
  "learning_rate": 1.9616713051497496e-05,
1320
+ "loss": 2.2044,
1321
  "step": 185
1322
  },
1323
  {
1324
  "epoch": 0.4423305588585018,
1325
+ "grad_norm": 1.0861978175484295,
1326
  "learning_rate": 1.9611442727628344e-05,
1327
+ "loss": 2.3267,
1328
  "step": 186
1329
  },
1330
  {
1331
  "epoch": 0.4447086801426873,
1332
+ "grad_norm": 1.0122924353396487,
1333
  "learning_rate": 1.960613713416206e-05,
1334
+ "loss": 2.2327,
1335
  "step": 187
1336
  },
1337
  {
1338
  "epoch": 0.4470868014268728,
1339
+ "grad_norm": 1.1275635495135592,
1340
  "learning_rate": 1.9600796290567747e-05,
1341
+ "loss": 2.2474,
1342
  "step": 188
1343
  },
1344
  {
1345
  "epoch": 0.44946492271105826,
1346
+ "grad_norm": 1.0778906611663819,
1347
  "learning_rate": 1.9595420216443864e-05,
1348
+ "loss": 2.2777,
1349
  "step": 189
1350
  },
1351
  {
1352
  "epoch": 0.4518430439952438,
1353
+ "grad_norm": 1.0593499669893551,
1354
  "learning_rate": 1.9590008931518133e-05,
1355
+ "loss": 2.4937,
1356
  "step": 190
1357
  },
1358
  {
1359
  "epoch": 0.45422116527942924,
1360
+ "grad_norm": 1.0887914371115388,
1361
  "learning_rate": 1.9584562455647494e-05,
1362
+ "loss": 2.2577,
1363
  "step": 191
1364
  },
1365
  {
1366
  "epoch": 0.45659928656361476,
1367
+ "grad_norm": 1.0280779311785984,
1368
  "learning_rate": 1.9579080808818035e-05,
1369
+ "loss": 2.2352,
1370
  "step": 192
1371
  },
1372
  {
1373
  "epoch": 0.4589774078478002,
1374
+ "grad_norm": 1.1201705856067985,
1375
  "learning_rate": 1.9573564011144873e-05,
1376
+ "loss": 2.1482,
1377
  "step": 193
1378
  },
1379
  {
1380
  "epoch": 0.46135552913198574,
1381
+ "grad_norm": 1.0039435227655624,
1382
  "learning_rate": 1.9568012082872148e-05,
1383
+ "loss": 2.1069,
1384
  "step": 194
1385
  },
1386
  {
1387
  "epoch": 0.4637336504161712,
1388
+ "grad_norm": 1.0523831000821406,
1389
  "learning_rate": 1.9562425044372884e-05,
1390
+ "loss": 1.9268,
1391
  "step": 195
1392
  },
1393
  {
1394
  "epoch": 0.4661117717003567,
1395
+ "grad_norm": 1.0635880350342213,
1396
  "learning_rate": 1.9556802916148963e-05,
1397
+ "loss": 2.2722,
1398
  "step": 196
1399
  },
1400
  {
1401
  "epoch": 0.4684898929845422,
1402
+ "grad_norm": 2.4351848601787287,
1403
  "learning_rate": 1.955114571883102e-05,
1404
+ "loss": 2.1402,
1405
  "step": 197
1406
  },
1407
  {
1408
  "epoch": 0.4708680142687277,
1409
+ "grad_norm": 1.2199308274597462,
1410
  "learning_rate": 1.9545453473178384e-05,
1411
+ "loss": 2.2599,
1412
  "step": 198
1413
  },
1414
  {
1415
  "epoch": 0.47324613555291317,
1416
+ "grad_norm": 0.9936114796299212,
1417
  "learning_rate": 1.9539726200078987e-05,
1418
+ "loss": 2.0662,
1419
  "step": 199
1420
  },
1421
  {
1422
  "epoch": 0.4756242568370987,
1423
+ "grad_norm": 1.0692703333507547,
1424
  "learning_rate": 1.9533963920549307e-05,
1425
+ "loss": 2.3739,
1426
  "step": 200
1427
  },
1428
  {
1429
  "epoch": 0.4780023781212842,
1430
+ "grad_norm": 1.0406002686664542,
1431
  "learning_rate": 1.9528166655734267e-05,
1432
+ "loss": 2.3611,
1433
  "step": 201
1434
  },
1435
  {
1436
  "epoch": 0.4803804994054697,
1437
+ "grad_norm": 1.9375905536343168,
1438
  "learning_rate": 1.9522334426907185e-05,
1439
+ "loss": 2.0971,
1440
  "step": 202
1441
  },
1442
  {
1443
  "epoch": 0.4827586206896552,
1444
+ "grad_norm": 1.024548704059581,
1445
  "learning_rate": 1.951646725546966e-05,
1446
+ "loss": 2.2498,
1447
  "step": 203
1448
  },
1449
  {
1450
  "epoch": 0.48513674197384066,
1451
+ "grad_norm": 1.0033895284405978,
1452
  "learning_rate": 1.9510565162951538e-05,
1453
+ "loss": 2.299,
1454
  "step": 204
1455
  },
1456
  {
1457
  "epoch": 0.4875148632580262,
1458
+ "grad_norm": 1.541631519071697,
1459
  "learning_rate": 1.950462817101079e-05,
1460
+ "loss": 2.4076,
1461
  "step": 205
1462
  },
1463
  {
1464
  "epoch": 0.48989298454221164,
1465
+ "grad_norm": 0.9499702987331401,
1466
  "learning_rate": 1.9498656301433466e-05,
1467
+ "loss": 2.0754,
1468
  "step": 206
1469
  },
1470
  {
1471
  "epoch": 0.49227110582639716,
1472
+ "grad_norm": 1.099383371761328,
1473
  "learning_rate": 1.9492649576133594e-05,
1474
+ "loss": 2.2514,
1475
  "step": 207
1476
  },
1477
  {
1478
  "epoch": 0.4946492271105826,
1479
+ "grad_norm": 0.9296431838496088,
1480
  "learning_rate": 1.94866080171531e-05,
1481
+ "loss": 2.2308,
1482
  "step": 208
1483
  },
1484
  {
1485
  "epoch": 0.49702734839476814,
1486
+ "grad_norm": 4.140796209905845,
1487
  "learning_rate": 1.9480531646661753e-05,
1488
+ "loss": 2.4388,
1489
  "step": 209
1490
  },
1491
  {
1492
  "epoch": 0.4994054696789536,
1493
+ "grad_norm": 1.011142238194789,
1494
  "learning_rate": 1.9474420486957045e-05,
1495
+ "loss": 2.2414,
1496
  "step": 210
1497
  },
1498
  {
1499
  "epoch": 0.4994054696789536,
1500
+ "eval_loss": 2.49302339553833,
1501
+ "eval_runtime": 65.8636,
1502
+ "eval_samples_per_second": 9.702,
1503
+ "eval_steps_per_second": 1.215,
1504
  "step": 210
1505
  }
1506
  ],
 
1521
  "attributes": {}
1522
  }
1523
  },
1524
+ "total_flos": 1.3027116576669696e+17,
1525
  "train_batch_size": 1,
1526
  "trial_name": null,
1527
  "trial_params": null
checkpoint-210/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14afcc01255b192ff7972e2f101184c05665fd6b6b6ac8aba6cded13c143de1c
3
  size 8504
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c4afa27d1a40ecd661f9724785c2e34f68177e49c01784ff719ada5cf02a780
3
  size 8504