fengyao1909 commited on
Commit
f0e1adc
·
verified ·
1 Parent(s): 9eebd67

Upload folder using huggingface_hub

Browse files
model-00001-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2cb82b1e4227f90eb539df67a23906a0d386529bd90e4416518981a939d50df4
3
  size 4997184968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb6ec89beb1a3a15c3c323a78468a011a90c4d419a710ded43d64b96710ebcb2
3
  size 4997184968
model-00002-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:165c9b97ecf4122feed064367305d6b4ee747f28ff4fc7e8d94e393844c8fee0
3
  size 4997741608
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97e488192375326f97a9d4de1661b5f2853ced9541b05bc9c6bd6fa171162987
3
  size 4997741608
model-00003-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21c28227bd44c09f75a1cc23b6223b02c78147ebfe2c9d137f0f3f06257be640
3
  size 4997742208
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ee1d363d5ff4cb23949928657ea12ac91b6a8573bfd7c67209d8226028c3212
3
  size 4997742208
model-00004-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fce7c4b609b2199ba4bc2760712d703c3f2a4560ae5c3d32282c20d2abd1c67b
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40ecd2225d04967348e873332b3cb64359bdf5c9788e9a125e97c2afc6f882f3
3
  size 4997743184
model-00005-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:308943706770822f3f0e3c55998232a8c0632e9d10eda02c9106273735d1b726
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5ac201b1d37785ad89ec4b990f9ca824eb2e152daf0800465436e872436d389
3
  size 4997743184
model-00006-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:adb6b63d7f05f576a681ed78e82162d3b8416dfbb288ae68e3f68b1dd6a8ea2f
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dfd04f2ab3f0732fafb9223fe3e2edab05b433aa9a176ab5e509e7e48d7f08c3
3
  size 4997743184
model-00007-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e6c55451276d19052507d488e74846ea84bd17a3ea982b7e1c8c5d4502b120fe
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d49269020f5db6a3a2867c0a983dc6aee5f2bcd861fd199c5f756b3a21e14f0
3
  size 4997743184
model-00008-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:493720f40234c7d1c82dded0266793cc1d77bfe3d3cf196ba6d16dc197a4330a
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd04590d86a5115cc651443381bdbb8b13aaa6543361ff4aacea69fb3d6bb106
3
  size 4997743184
model-00009-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:24f9ebf5e271e1b068f8fbf4312a124a5855857286fd063e78a25c6c46f735d9
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d1b44e9a3b53b1e8d17806d67cd92a1c243f5a9b0a5f4067b06c3e056d7fa92
3
  size 4997743184
model-00010-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0fc67b140de70461152ffe0d1d4f39d610273d498a180520eb93519a9578a1f
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed5e6c59693fb4545ef1a751ae89f1d259d017cc1c0f4c16a1724298c5ebbbe
3
  size 4997743184
model-00011-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c15508f97bd308053a4316b31a4eac5c24e60a9e1df4d762ebfaac10e9e0ed58
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f15933c6130e6814eb07617d25211181c67a1da13e6b620ed4b67f6c1c78925f
3
  size 4997743184
model-00012-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da7e4a9da619d34efd17daed024a177a1251ff995b46630f755b6699327d912d
3
  size 4997743184
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63fca5cc1b8a5644c2a7238112ed7c1396ae81b26286dd4ca7d7fa8d7049dd24
3
  size 4997743184
model-00013-of-00013.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f27829fb905e82ca6602bfe1c29fed13cf8084e97dab950a5ccc8b35ef990aa
3
  size 1094220288
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:486407d221e32bbb0ebeeb7e39c521de9120456af2d0a722eca139320d7e1113
3
  size 1094220288
trainer_state.json CHANGED
@@ -11,800 +11,800 @@
11
  "log_history": [
12
  {
13
  "epoch": 0.017543859649122806,
14
- "grad_norm": 1.8317057887019643,
15
  "learning_rate": 0.0,
16
  "loss": 0.8483,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.03508771929824561,
21
- "grad_norm": 1.7353440418305668,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 0.8763,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.05263157894736842,
28
- "grad_norm": 1.9110385217084043,
29
  "learning_rate": 6.666666666666667e-06,
30
- "loss": 1.0195,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.07017543859649122,
35
- "grad_norm": 1.650455460756132,
36
  "learning_rate": 1e-05,
37
- "loss": 1.049,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.08771929824561403,
42
- "grad_norm": 0.9846115915202341,
43
  "learning_rate": 1.3333333333333333e-05,
44
- "loss": 0.8378,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
- "grad_norm": 1.2896673412488742,
50
  "learning_rate": 1.6666666666666667e-05,
51
- "loss": 0.9736,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.12280701754385964,
56
- "grad_norm": 2.011543412600694,
57
  "learning_rate": 2e-05,
58
- "loss": 0.9678,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.14035087719298245,
63
- "grad_norm": 1.6250329547144573,
64
  "learning_rate": 2.3333333333333336e-05,
65
- "loss": 0.9089,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.15789473684210525,
70
- "grad_norm": 1.6751272065587126,
71
  "learning_rate": 2.6666666666666667e-05,
72
- "loss": 0.9359,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.17543859649122806,
77
- "grad_norm": 1.4234656524715992,
78
  "learning_rate": 3e-05,
79
- "loss": 0.9585,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.19298245614035087,
84
- "grad_norm": 1.4638280218342359,
85
  "learning_rate": 3.3333333333333335e-05,
86
- "loss": 1.0041,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.21052631578947367,
91
- "grad_norm": 1.0419301809036907,
92
  "learning_rate": 3.6666666666666666e-05,
93
- "loss": 0.8763,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.22807017543859648,
98
- "grad_norm": 1.198346639614849,
99
  "learning_rate": 4e-05,
100
- "loss": 0.9129,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.24561403508771928,
105
- "grad_norm": 1.0034373969603534,
106
  "learning_rate": 4.3333333333333334e-05,
107
- "loss": 0.8965,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.2631578947368421,
112
- "grad_norm": 1.0380657919294447,
113
  "learning_rate": 4.666666666666667e-05,
114
- "loss": 0.957,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.2807017543859649,
119
- "grad_norm": 0.9533939175115029,
120
  "learning_rate": 5e-05,
121
- "loss": 0.91,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.2982456140350877,
126
- "grad_norm": 1.2098882485474858,
127
  "learning_rate": 4.999830770009406e-05,
128
- "loss": 1.0061,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.3157894736842105,
133
- "grad_norm": 0.9892702554456709,
134
  "learning_rate": 4.9993231029486544e-05,
135
- "loss": 0.8912,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.3333333333333333,
140
- "grad_norm": 0.715562209343003,
141
  "learning_rate": 4.99847706754774e-05,
142
- "loss": 0.9125,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 0.3508771929824561,
147
- "grad_norm": 0.8839740963896582,
148
  "learning_rate": 4.997292778346312e-05,
149
- "loss": 0.9544,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 0.3684210526315789,
154
- "grad_norm": 0.8672090781221823,
155
  "learning_rate": 4.995770395678171e-05,
156
- "loss": 0.8931,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 0.38596491228070173,
161
- "grad_norm": 0.5657183889732781,
162
  "learning_rate": 4.993910125649561e-05,
163
- "loss": 0.7591,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 0.40350877192982454,
168
- "grad_norm": 0.8395194599838961,
169
  "learning_rate": 4.9917122201112656e-05,
170
- "loss": 0.8958,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 0.42105263157894735,
175
- "grad_norm": 0.7949156783532444,
176
  "learning_rate": 4.989176976624511e-05,
177
- "loss": 0.7957,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 0.43859649122807015,
182
- "grad_norm": 0.6233078811754251,
183
  "learning_rate": 4.9863047384206835e-05,
184
- "loss": 0.9287,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 0.45614035087719296,
189
- "grad_norm": 0.6592512357205284,
190
  "learning_rate": 4.983095894354858e-05,
191
- "loss": 0.8961,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 0.47368421052631576,
196
- "grad_norm": 0.7540428801849153,
197
  "learning_rate": 4.979550878853154e-05,
198
- "loss": 0.8113,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 0.49122807017543857,
203
- "grad_norm": 0.6894407810210612,
204
  "learning_rate": 4.975670171853926e-05,
205
- "loss": 0.8837,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 0.5087719298245614,
210
- "grad_norm": 0.5962438335142802,
211
  "learning_rate": 4.971454298742779e-05,
212
- "loss": 0.9039,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 0.5263157894736842,
217
- "grad_norm": 0.6159211013371156,
218
  "learning_rate": 4.966903830281449e-05,
219
- "loss": 0.7894,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 0.543859649122807,
224
- "grad_norm": 0.7929356664184409,
225
  "learning_rate": 4.962019382530521e-05,
226
- "loss": 0.8463,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 0.5614035087719298,
231
- "grad_norm": 0.5952896775612254,
232
  "learning_rate": 4.9568016167660334e-05,
233
- "loss": 0.7816,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 0.5789473684210527,
238
- "grad_norm": 0.6758915434113398,
239
  "learning_rate": 4.951251239389948e-05,
240
- "loss": 0.8776,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 0.5964912280701754,
245
- "grad_norm": 0.6318291971598253,
246
  "learning_rate": 4.9453690018345144e-05,
247
- "loss": 0.9796,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 0.6140350877192983,
252
- "grad_norm": 0.7115801773839961,
253
  "learning_rate": 4.939155700460536e-05,
254
- "loss": 0.8228,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 0.631578947368421,
259
- "grad_norm": 0.5864293536264706,
260
  "learning_rate": 4.9326121764495596e-05,
261
- "loss": 0.839,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 0.6491228070175439,
266
- "grad_norm": 0.6287778513114669,
267
  "learning_rate": 4.925739315689991e-05,
268
- "loss": 0.8778,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 0.6666666666666666,
273
- "grad_norm": 0.6735952098345708,
274
  "learning_rate": 4.9185380486571595e-05,
275
- "loss": 0.885,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 0.6842105263157895,
280
- "grad_norm": 0.6067535888888641,
281
  "learning_rate": 4.9110093502873476e-05,
282
- "loss": 0.9082,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 0.7017543859649122,
287
- "grad_norm": 0.5247508841910119,
288
  "learning_rate": 4.9031542398457974e-05,
289
- "loss": 0.8739,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 0.7192982456140351,
294
- "grad_norm": 0.5918453184670143,
295
  "learning_rate": 4.894973780788722e-05,
296
- "loss": 0.8829,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 0.7368421052631579,
301
- "grad_norm": 0.6372269077212606,
302
  "learning_rate": 4.88646908061933e-05,
303
- "loss": 0.8152,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 0.7543859649122807,
308
- "grad_norm": 0.5430663295315261,
309
  "learning_rate": 4.877641290737884e-05,
310
- "loss": 0.8514,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 0.7719298245614035,
315
- "grad_norm": 0.511597332007924,
316
  "learning_rate": 4.868491606285823e-05,
317
- "loss": 0.7675,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 0.7894736842105263,
322
- "grad_norm": 0.48967659833861754,
323
  "learning_rate": 4.859021265983959e-05,
324
- "loss": 0.8826,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 0.8070175438596491,
329
- "grad_norm": 0.5856811530030098,
330
  "learning_rate": 4.849231551964771e-05,
331
- "loss": 0.8585,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 0.8245614035087719,
336
- "grad_norm": 0.5845644414839125,
337
  "learning_rate": 4.839123789598829e-05,
338
- "loss": 0.8677,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 0.8421052631578947,
343
- "grad_norm": 0.4974332648938187,
344
  "learning_rate": 4.828699347315356e-05,
345
- "loss": 0.845,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 0.8596491228070176,
350
- "grad_norm": 0.5566174918152419,
351
  "learning_rate": 4.817959636416969e-05,
352
- "loss": 0.9432,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 0.8771929824561403,
357
- "grad_norm": 0.4921351014911049,
358
  "learning_rate": 4.806906110888606e-05,
359
- "loss": 0.6756,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 0.8947368421052632,
364
- "grad_norm": 0.6352002986003276,
365
  "learning_rate": 4.7955402672006854e-05,
366
- "loss": 0.9077,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 0.9122807017543859,
371
- "grad_norm": 0.5228777570085404,
372
  "learning_rate": 4.783863644106502e-05,
373
- "loss": 0.8815,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 0.9298245614035088,
378
- "grad_norm": 0.5753314530576694,
379
  "learning_rate": 4.771877822433911e-05,
380
- "loss": 0.8342,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 0.9473684210526315,
385
- "grad_norm": 0.5127814989448164,
386
  "learning_rate": 4.759584424871302e-05,
387
- "loss": 0.7837,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 0.9649122807017544,
392
- "grad_norm": 0.545931046230859,
393
  "learning_rate": 4.7469851157479177e-05,
394
- "loss": 0.8229,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 0.9824561403508771,
399
- "grad_norm": 0.4669374807712089,
400
  "learning_rate": 4.734081600808531e-05,
401
- "loss": 0.8341,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 1.0,
406
- "grad_norm": 0.5386650743226552,
407
  "learning_rate": 4.7208756269825104e-05,
408
- "loss": 0.7062,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 1.0175438596491229,
413
- "grad_norm": 0.6793013357179898,
414
  "learning_rate": 4.707368982147318e-05,
415
- "loss": 0.7762,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 1.0350877192982457,
420
- "grad_norm": 0.627446069759414,
421
  "learning_rate": 4.693563494886455e-05,
422
- "loss": 0.6663,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 1.0526315789473684,
427
- "grad_norm": 0.6603925054843989,
428
  "learning_rate": 4.679461034241906e-05,
429
- "loss": 0.7633,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 1.0701754385964912,
434
- "grad_norm": 0.8788262913431679,
435
  "learning_rate": 4.665063509461097e-05,
436
- "loss": 0.6608,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 1.087719298245614,
441
- "grad_norm": 0.6735739896088276,
442
  "learning_rate": 4.650372869738414e-05,
443
- "loss": 0.7057,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 1.1052631578947367,
448
- "grad_norm": 1.1748331139699308,
449
  "learning_rate": 4.6353911039513145e-05,
450
- "loss": 0.7607,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 1.1228070175438596,
455
- "grad_norm": 0.8665469540278155,
456
  "learning_rate": 4.620120240391065e-05,
457
- "loss": 0.7229,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 1.1403508771929824,
462
- "grad_norm": 0.6395824700230515,
463
  "learning_rate": 4.604562346488144e-05,
464
- "loss": 0.6613,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 1.1578947368421053,
469
- "grad_norm": 0.7075740149066272,
470
  "learning_rate": 4.588719528532342e-05,
471
- "loss": 0.6818,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 1.1754385964912282,
476
- "grad_norm": 0.6512740767179199,
477
  "learning_rate": 4.572593931387604e-05,
478
- "loss": 0.6719,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 1.1929824561403508,
483
- "grad_norm": 0.7391191118171585,
484
  "learning_rate": 4.556187738201656e-05,
485
- "loss": 0.716,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 1.2105263157894737,
490
- "grad_norm": 0.6544655450388867,
491
  "learning_rate": 4.539503170110431e-05,
492
- "loss": 0.7671,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 1.2280701754385965,
497
- "grad_norm": 0.4890450142415007,
498
  "learning_rate": 4.522542485937369e-05,
499
- "loss": 0.7089,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 1.2456140350877192,
504
- "grad_norm": 0.6629174444368172,
505
  "learning_rate": 4.50530798188761e-05,
506
- "loss": 0.7644,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 1.263157894736842,
511
- "grad_norm": 1.313611758541734,
512
  "learning_rate": 4.48780199123712e-05,
513
- "loss": 0.7247,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 1.280701754385965,
518
- "grad_norm": 0.621594494051855,
519
  "learning_rate": 4.4700268840168045e-05,
520
- "loss": 0.7177,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 1.2982456140350878,
525
- "grad_norm": 0.543283643604564,
526
  "learning_rate": 4.4519850666916484e-05,
527
- "loss": 0.6484,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 1.3157894736842106,
532
- "grad_norm": 0.5692124563338323,
533
  "learning_rate": 4.43367898183491e-05,
534
- "loss": 0.7805,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 1.3333333333333333,
539
- "grad_norm": 0.5140372497115965,
540
  "learning_rate": 4.415111107797445e-05,
541
- "loss": 0.6551,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 1.3508771929824561,
546
- "grad_norm": 0.5556993211984754,
547
  "learning_rate": 4.396283958372173e-05,
548
- "loss": 0.6435,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 1.368421052631579,
553
- "grad_norm": 0.6297070634295856,
554
  "learning_rate": 4.377200082453749e-05,
555
- "loss": 0.7551,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 1.3859649122807016,
560
- "grad_norm": 0.5343607416783988,
561
  "learning_rate": 4.357862063693486e-05,
562
- "loss": 0.7401,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 1.4035087719298245,
567
- "grad_norm": 0.6933983776921214,
568
  "learning_rate": 4.3382725201495723e-05,
569
- "loss": 0.7092,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 1.4210526315789473,
574
- "grad_norm": 0.48867547087530555,
575
  "learning_rate": 4.318434103932622e-05,
576
- "loss": 0.7157,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 1.4385964912280702,
581
- "grad_norm": 0.5074429307644182,
582
  "learning_rate": 4.2983495008466276e-05,
583
- "loss": 0.6988,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 1.456140350877193,
588
- "grad_norm": 0.546003726314663,
589
  "learning_rate": 4.278021430025343e-05,
590
- "loss": 0.6233,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 1.4736842105263157,
595
- "grad_norm": 0.46092178005444556,
596
  "learning_rate": 4.257452643564155e-05,
597
- "loss": 0.5562,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 1.4912280701754386,
602
- "grad_norm": 0.5753767938443312,
603
  "learning_rate": 4.2366459261474933e-05,
604
- "loss": 0.7155,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 1.5087719298245614,
609
- "grad_norm": 0.5118274305991928,
610
  "learning_rate": 4.215604094671835e-05,
611
- "loss": 0.6921,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 1.526315789473684,
616
- "grad_norm": 0.6380979260118252,
617
  "learning_rate": 4.194329997864331e-05,
618
- "loss": 0.8256,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 1.543859649122807,
623
- "grad_norm": 0.5435984291436712,
624
  "learning_rate": 4.172826515897146e-05,
625
- "loss": 0.7167,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 1.5614035087719298,
630
- "grad_norm": 0.4654905831182726,
631
  "learning_rate": 4.1510965599975196e-05,
632
- "loss": 0.7498,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 1.5789473684210527,
637
- "grad_norm": 0.5655561181667187,
638
  "learning_rate": 4.129143072053638e-05,
639
- "loss": 0.6566,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 1.5964912280701755,
644
- "grad_norm": 0.46666240125611763,
645
  "learning_rate": 4.1069690242163484e-05,
646
- "loss": 0.6798,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 1.6140350877192984,
651
- "grad_norm": 0.45866681537946846,
652
  "learning_rate": 4.0845774184967754e-05,
653
- "loss": 0.702,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 1.631578947368421,
658
- "grad_norm": 0.41578111266812384,
659
  "learning_rate": 4.0619712863599e-05,
660
- "loss": 0.6448,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 1.6491228070175439,
665
- "grad_norm": 0.37343214189013274,
666
  "learning_rate": 4.039153688314145e-05,
667
- "loss": 0.6486,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 1.6666666666666665,
672
- "grad_norm": 0.4534534413589986,
673
  "learning_rate": 4.0161277134970345e-05,
674
- "loss": 0.7514,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 1.6842105263157894,
679
- "grad_norm": 0.5086874484641599,
680
  "learning_rate": 3.9928964792569655e-05,
681
- "loss": 0.6561,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 1.7017543859649122,
686
- "grad_norm": 0.5099982692153668,
687
  "learning_rate": 3.969463130731183e-05,
688
- "loss": 0.818,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 1.719298245614035,
693
- "grad_norm": 0.5239173636107562,
694
  "learning_rate": 3.945830840419966e-05,
695
- "loss": 0.707,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 1.736842105263158,
700
- "grad_norm": 0.5057288153759746,
701
  "learning_rate": 3.9220028077571295e-05,
702
- "loss": 0.7297,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 1.7543859649122808,
707
- "grad_norm": 0.4578235248160848,
708
  "learning_rate": 3.897982258676867e-05,
709
- "loss": 0.6636,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 1.7719298245614035,
714
- "grad_norm": 0.4741202765186654,
715
  "learning_rate": 3.873772445177015e-05,
716
- "loss": 0.7528,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 1.7894736842105263,
721
- "grad_norm": 0.4039174802249478,
722
  "learning_rate": 3.8493766448787825e-05,
723
- "loss": 0.7435,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 1.807017543859649,
728
- "grad_norm": 0.589597565267169,
729
  "learning_rate": 3.824798160583012e-05,
730
- "loss": 0.7506,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 1.8245614035087718,
735
- "grad_norm": 0.44486881957400587,
736
  "learning_rate": 3.8000403198230387e-05,
737
- "loss": 0.6197,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 1.8421052631578947,
742
- "grad_norm": 0.4171720478200468,
743
  "learning_rate": 3.775106474414188e-05,
744
- "loss": 0.5865,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 1.8596491228070176,
749
- "grad_norm": 0.6688671369020073,
750
  "learning_rate": 3.7500000000000003e-05,
751
- "loss": 0.8046,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 1.8771929824561404,
756
- "grad_norm": 0.594947999839499,
757
  "learning_rate": 3.7247242955952175e-05,
758
- "loss": 0.7218,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 1.8947368421052633,
763
- "grad_norm": 0.4541655734242718,
764
  "learning_rate": 3.699282783125616e-05,
765
- "loss": 0.7191,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 1.912280701754386,
770
- "grad_norm": 0.44254870382471806,
771
  "learning_rate": 3.673678906964727e-05,
772
- "loss": 0.6318,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 1.9298245614035088,
777
- "grad_norm": 0.5177824211881119,
778
  "learning_rate": 3.6479161334675296e-05,
779
- "loss": 0.7521,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 1.9473684210526314,
784
- "grad_norm": 0.5109935026690099,
785
  "learning_rate": 3.621997950501156e-05,
786
- "loss": 0.7555,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 1.9649122807017543,
791
- "grad_norm": 0.6025137993464594,
792
  "learning_rate": 3.5959278669726935e-05,
793
- "loss": 0.7303,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 1.9824561403508771,
798
- "grad_norm": 0.45225891946430735,
799
  "learning_rate": 3.569709412354136e-05,
800
- "loss": 0.7882,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 2.0,
805
- "grad_norm": 0.613838203258988,
806
  "learning_rate": 3.543346136204545e-05,
807
- "loss": 0.6483,
808
  "step": 114
809
  }
810
  ],
 
11
  "log_history": [
12
  {
13
  "epoch": 0.017543859649122806,
14
+ "grad_norm": 2.410881066137832,
15
  "learning_rate": 0.0,
16
  "loss": 0.8483,
17
  "step": 1
18
  },
19
  {
20
  "epoch": 0.03508771929824561,
21
+ "grad_norm": 2.665228254124179,
22
  "learning_rate": 3.3333333333333333e-06,
23
  "loss": 0.8763,
24
  "step": 2
25
  },
26
  {
27
  "epoch": 0.05263157894736842,
28
+ "grad_norm": 2.3728587057049992,
29
  "learning_rate": 6.666666666666667e-06,
30
+ "loss": 1.019,
31
  "step": 3
32
  },
33
  {
34
  "epoch": 0.07017543859649122,
35
+ "grad_norm": 1.9987528382054482,
36
  "learning_rate": 1e-05,
37
+ "loss": 1.0451,
38
  "step": 4
39
  },
40
  {
41
  "epoch": 0.08771929824561403,
42
+ "grad_norm": 1.2387448314677751,
43
  "learning_rate": 1.3333333333333333e-05,
44
+ "loss": 0.8298,
45
  "step": 5
46
  },
47
  {
48
  "epoch": 0.10526315789473684,
49
+ "grad_norm": 1.4006709843865188,
50
  "learning_rate": 1.6666666666666667e-05,
51
+ "loss": 0.9668,
52
  "step": 6
53
  },
54
  {
55
  "epoch": 0.12280701754385964,
56
+ "grad_norm": 2.1941304727738995,
57
  "learning_rate": 2e-05,
58
+ "loss": 0.9639,
59
  "step": 7
60
  },
61
  {
62
  "epoch": 0.14035087719298245,
63
+ "grad_norm": 2.0965952504080145,
64
  "learning_rate": 2.3333333333333336e-05,
65
+ "loss": 0.906,
66
  "step": 8
67
  },
68
  {
69
  "epoch": 0.15789473684210525,
70
+ "grad_norm": 2.0779003780741325,
71
  "learning_rate": 2.6666666666666667e-05,
72
+ "loss": 0.9331,
73
  "step": 9
74
  },
75
  {
76
  "epoch": 0.17543859649122806,
77
+ "grad_norm": 1.914852860029578,
78
  "learning_rate": 3e-05,
79
+ "loss": 0.9615,
80
  "step": 10
81
  },
82
  {
83
  "epoch": 0.19298245614035087,
84
+ "grad_norm": 1.5453059456697873,
85
  "learning_rate": 3.3333333333333335e-05,
86
+ "loss": 0.9976,
87
  "step": 11
88
  },
89
  {
90
  "epoch": 0.21052631578947367,
91
+ "grad_norm": 1.2149278389301041,
92
  "learning_rate": 3.6666666666666666e-05,
93
+ "loss": 0.8731,
94
  "step": 12
95
  },
96
  {
97
  "epoch": 0.22807017543859648,
98
+ "grad_norm": 9.574453131193065,
99
  "learning_rate": 4e-05,
100
+ "loss": 0.9048,
101
  "step": 13
102
  },
103
  {
104
  "epoch": 0.24561403508771928,
105
+ "grad_norm": 1.7542365583306754,
106
  "learning_rate": 4.3333333333333334e-05,
107
+ "loss": 0.9045,
108
  "step": 14
109
  },
110
  {
111
  "epoch": 0.2631578947368421,
112
+ "grad_norm": 1.5687648722942888,
113
  "learning_rate": 4.666666666666667e-05,
114
+ "loss": 0.9642,
115
  "step": 15
116
  },
117
  {
118
  "epoch": 0.2807017543859649,
119
+ "grad_norm": 1.1629052446813282,
120
  "learning_rate": 5e-05,
121
+ "loss": 0.9157,
122
  "step": 16
123
  },
124
  {
125
  "epoch": 0.2982456140350877,
126
+ "grad_norm": 1.0662859779498828,
127
  "learning_rate": 4.999830770009406e-05,
128
+ "loss": 1.0076,
129
  "step": 17
130
  },
131
  {
132
  "epoch": 0.3157894736842105,
133
+ "grad_norm": 1.1540729762939772,
134
  "learning_rate": 4.9993231029486544e-05,
135
+ "loss": 0.8898,
136
  "step": 18
137
  },
138
  {
139
  "epoch": 0.3333333333333333,
140
+ "grad_norm": 2.284686909029557,
141
  "learning_rate": 4.99847706754774e-05,
142
+ "loss": 0.9111,
143
  "step": 19
144
  },
145
  {
146
  "epoch": 0.3508771929824561,
147
+ "grad_norm": 0.9299877447474001,
148
  "learning_rate": 4.997292778346312e-05,
149
+ "loss": 0.9567,
150
  "step": 20
151
  },
152
  {
153
  "epoch": 0.3684210526315789,
154
+ "grad_norm": 0.7891834706045853,
155
  "learning_rate": 4.995770395678171e-05,
156
+ "loss": 0.8907,
157
  "step": 21
158
  },
159
  {
160
  "epoch": 0.38596491228070173,
161
+ "grad_norm": 0.7738739475483105,
162
  "learning_rate": 4.993910125649561e-05,
163
+ "loss": 0.7582,
164
  "step": 22
165
  },
166
  {
167
  "epoch": 0.40350877192982454,
168
+ "grad_norm": 0.7564543923604894,
169
  "learning_rate": 4.9917122201112656e-05,
170
+ "loss": 0.893,
171
  "step": 23
172
  },
173
  {
174
  "epoch": 0.42105263157894735,
175
+ "grad_norm": 0.7725302637179225,
176
  "learning_rate": 4.989176976624511e-05,
177
+ "loss": 0.794,
178
  "step": 24
179
  },
180
  {
181
  "epoch": 0.43859649122807015,
182
+ "grad_norm": 0.7154589746662706,
183
  "learning_rate": 4.9863047384206835e-05,
184
+ "loss": 0.927,
185
  "step": 25
186
  },
187
  {
188
  "epoch": 0.45614035087719296,
189
+ "grad_norm": 0.748200539385192,
190
  "learning_rate": 4.983095894354858e-05,
191
+ "loss": 0.8946,
192
  "step": 26
193
  },
194
  {
195
  "epoch": 0.47368421052631576,
196
+ "grad_norm": 0.6877177188589038,
197
  "learning_rate": 4.979550878853154e-05,
198
+ "loss": 0.8108,
199
  "step": 27
200
  },
201
  {
202
  "epoch": 0.49122807017543857,
203
+ "grad_norm": 0.6905557124467382,
204
  "learning_rate": 4.975670171853926e-05,
205
+ "loss": 0.8808,
206
  "step": 28
207
  },
208
  {
209
  "epoch": 0.5087719298245614,
210
+ "grad_norm": 0.6261501879052016,
211
  "learning_rate": 4.971454298742779e-05,
212
+ "loss": 0.902,
213
  "step": 29
214
  },
215
  {
216
  "epoch": 0.5263157894736842,
217
+ "grad_norm": 0.6935833833701227,
218
  "learning_rate": 4.966903830281449e-05,
219
+ "loss": 0.7902,
220
  "step": 30
221
  },
222
  {
223
  "epoch": 0.543859649122807,
224
+ "grad_norm": 0.7380295796572786,
225
  "learning_rate": 4.962019382530521e-05,
226
+ "loss": 0.8427,
227
  "step": 31
228
  },
229
  {
230
  "epoch": 0.5614035087719298,
231
+ "grad_norm": 0.6686373976763716,
232
  "learning_rate": 4.9568016167660334e-05,
233
+ "loss": 0.7824,
234
  "step": 32
235
  },
236
  {
237
  "epoch": 0.5789473684210527,
238
+ "grad_norm": 0.7284830674570405,
239
  "learning_rate": 4.951251239389948e-05,
240
+ "loss": 0.8758,
241
  "step": 33
242
  },
243
  {
244
  "epoch": 0.5964912280701754,
245
+ "grad_norm": 0.7667345279664498,
246
  "learning_rate": 4.9453690018345144e-05,
247
+ "loss": 0.9769,
248
  "step": 34
249
  },
250
  {
251
  "epoch": 0.6140350877192983,
252
+ "grad_norm": 0.7186885267427574,
253
  "learning_rate": 4.939155700460536e-05,
254
+ "loss": 0.8222,
255
  "step": 35
256
  },
257
  {
258
  "epoch": 0.631578947368421,
259
+ "grad_norm": 0.7129526253734785,
260
  "learning_rate": 4.9326121764495596e-05,
261
+ "loss": 0.8398,
262
  "step": 36
263
  },
264
  {
265
  "epoch": 0.6491228070175439,
266
+ "grad_norm": 0.6763307332151739,
267
  "learning_rate": 4.925739315689991e-05,
268
+ "loss": 0.881,
269
  "step": 37
270
  },
271
  {
272
  "epoch": 0.6666666666666666,
273
+ "grad_norm": 0.7596612492978558,
274
  "learning_rate": 4.9185380486571595e-05,
275
+ "loss": 0.8883,
276
  "step": 38
277
  },
278
  {
279
  "epoch": 0.6842105263157895,
280
+ "grad_norm": 0.574360173449421,
281
  "learning_rate": 4.9110093502873476e-05,
282
+ "loss": 0.9078,
283
  "step": 39
284
  },
285
  {
286
  "epoch": 0.7017543859649122,
287
+ "grad_norm": 0.5045882171460956,
288
  "learning_rate": 4.9031542398457974e-05,
289
+ "loss": 0.8722,
290
  "step": 40
291
  },
292
  {
293
  "epoch": 0.7192982456140351,
294
+ "grad_norm": 0.6267476132219085,
295
  "learning_rate": 4.894973780788722e-05,
296
+ "loss": 0.883,
297
  "step": 41
298
  },
299
  {
300
  "epoch": 0.7368421052631579,
301
+ "grad_norm": 0.6021189923969766,
302
  "learning_rate": 4.88646908061933e-05,
303
+ "loss": 0.8144,
304
  "step": 42
305
  },
306
  {
307
  "epoch": 0.7543859649122807,
308
+ "grad_norm": 0.6014066609170303,
309
  "learning_rate": 4.877641290737884e-05,
310
+ "loss": 0.8496,
311
  "step": 43
312
  },
313
  {
314
  "epoch": 0.7719298245614035,
315
+ "grad_norm": 0.6014121657776419,
316
  "learning_rate": 4.868491606285823e-05,
317
+ "loss": 0.768,
318
  "step": 44
319
  },
320
  {
321
  "epoch": 0.7894736842105263,
322
+ "grad_norm": 0.5324268865416073,
323
  "learning_rate": 4.859021265983959e-05,
324
+ "loss": 0.8825,
325
  "step": 45
326
  },
327
  {
328
  "epoch": 0.8070175438596491,
329
+ "grad_norm": 0.6348338184686191,
330
  "learning_rate": 4.849231551964771e-05,
331
+ "loss": 0.8587,
332
  "step": 46
333
  },
334
  {
335
  "epoch": 0.8245614035087719,
336
+ "grad_norm": 0.6361465840640576,
337
  "learning_rate": 4.839123789598829e-05,
338
+ "loss": 0.868,
339
  "step": 47
340
  },
341
  {
342
  "epoch": 0.8421052631578947,
343
+ "grad_norm": 0.5806423652725062,
344
  "learning_rate": 4.828699347315356e-05,
345
+ "loss": 0.8445,
346
  "step": 48
347
  },
348
  {
349
  "epoch": 0.8596491228070176,
350
+ "grad_norm": 0.6650303211642944,
351
  "learning_rate": 4.817959636416969e-05,
352
+ "loss": 0.9439,
353
  "step": 49
354
  },
355
  {
356
  "epoch": 0.8771929824561403,
357
+ "grad_norm": 0.49618932027271617,
358
  "learning_rate": 4.806906110888606e-05,
359
+ "loss": 0.676,
360
  "step": 50
361
  },
362
  {
363
  "epoch": 0.8947368421052632,
364
+ "grad_norm": 0.7201128419875511,
365
  "learning_rate": 4.7955402672006854e-05,
366
+ "loss": 0.9129,
367
  "step": 51
368
  },
369
  {
370
  "epoch": 0.9122807017543859,
371
+ "grad_norm": 0.7979009332190922,
372
  "learning_rate": 4.783863644106502e-05,
373
+ "loss": 0.8844,
374
  "step": 52
375
  },
376
  {
377
  "epoch": 0.9298245614035088,
378
+ "grad_norm": 0.5780879587849941,
379
  "learning_rate": 4.771877822433911e-05,
380
+ "loss": 0.835,
381
  "step": 53
382
  },
383
  {
384
  "epoch": 0.9473684210526315,
385
+ "grad_norm": 0.5125413888902927,
386
  "learning_rate": 4.759584424871302e-05,
387
+ "loss": 0.7854,
388
  "step": 54
389
  },
390
  {
391
  "epoch": 0.9649122807017544,
392
+ "grad_norm": 0.6493310136858548,
393
  "learning_rate": 4.7469851157479177e-05,
394
+ "loss": 0.8246,
395
  "step": 55
396
  },
397
  {
398
  "epoch": 0.9824561403508771,
399
+ "grad_norm": 0.5420670379906682,
400
  "learning_rate": 4.734081600808531e-05,
401
+ "loss": 0.8357,
402
  "step": 56
403
  },
404
  {
405
  "epoch": 1.0,
406
+ "grad_norm": 0.8014271486806656,
407
  "learning_rate": 4.7208756269825104e-05,
408
+ "loss": 0.7012,
409
  "step": 57
410
  },
411
  {
412
  "epoch": 1.0175438596491229,
413
+ "grad_norm": 0.9216029966396811,
414
  "learning_rate": 4.707368982147318e-05,
415
+ "loss": 0.7618,
416
  "step": 58
417
  },
418
  {
419
  "epoch": 1.0350877192982457,
420
+ "grad_norm": 0.7324508540649931,
421
  "learning_rate": 4.693563494886455e-05,
422
+ "loss": 0.6381,
423
  "step": 59
424
  },
425
  {
426
  "epoch": 1.0526315789473684,
427
+ "grad_norm": 0.9404509582476525,
428
  "learning_rate": 4.679461034241906e-05,
429
+ "loss": 0.7367,
430
  "step": 60
431
  },
432
  {
433
  "epoch": 1.0701754385964912,
434
+ "grad_norm": 1.1079545865955729,
435
  "learning_rate": 4.665063509461097e-05,
436
+ "loss": 0.6252,
437
  "step": 61
438
  },
439
  {
440
  "epoch": 1.087719298245614,
441
+ "grad_norm": 0.8253491984345485,
442
  "learning_rate": 4.650372869738414e-05,
443
+ "loss": 0.6622,
444
  "step": 62
445
  },
446
  {
447
  "epoch": 1.1052631578947367,
448
+ "grad_norm": 1.237778560255967,
449
  "learning_rate": 4.6353911039513145e-05,
450
+ "loss": 0.7474,
451
  "step": 63
452
  },
453
  {
454
  "epoch": 1.1228070175438596,
455
+ "grad_norm": 0.8429832395594149,
456
  "learning_rate": 4.620120240391065e-05,
457
+ "loss": 0.6785,
458
  "step": 64
459
  },
460
  {
461
  "epoch": 1.1403508771929824,
462
+ "grad_norm": 0.7067163708835564,
463
  "learning_rate": 4.604562346488144e-05,
464
+ "loss": 0.633,
465
  "step": 65
466
  },
467
  {
468
  "epoch": 1.1578947368421053,
469
+ "grad_norm": 0.7599104494469864,
470
  "learning_rate": 4.588719528532342e-05,
471
+ "loss": 0.6462,
472
  "step": 66
473
  },
474
  {
475
  "epoch": 1.1754385964912282,
476
+ "grad_norm": 0.74600695035756,
477
  "learning_rate": 4.572593931387604e-05,
478
+ "loss": 0.6408,
479
  "step": 67
480
  },
481
  {
482
  "epoch": 1.1929824561403508,
483
+ "grad_norm": 0.8901782092713822,
484
  "learning_rate": 4.556187738201656e-05,
485
+ "loss": 0.68,
486
  "step": 68
487
  },
488
  {
489
  "epoch": 1.2105263157894737,
490
+ "grad_norm": 0.7884946794083679,
491
  "learning_rate": 4.539503170110431e-05,
492
+ "loss": 0.7179,
493
  "step": 69
494
  },
495
  {
496
  "epoch": 1.2280701754385965,
497
+ "grad_norm": 0.6924211614290692,
498
  "learning_rate": 4.522542485937369e-05,
499
+ "loss": 0.6849,
500
  "step": 70
501
  },
502
  {
503
  "epoch": 1.2456140350877192,
504
+ "grad_norm": 0.6629364599996329,
505
  "learning_rate": 4.50530798188761e-05,
506
+ "loss": 0.7385,
507
  "step": 71
508
  },
509
  {
510
  "epoch": 1.263157894736842,
511
+ "grad_norm": 1.155774350961545,
512
  "learning_rate": 4.48780199123712e-05,
513
+ "loss": 0.6942,
514
  "step": 72
515
  },
516
  {
517
  "epoch": 1.280701754385965,
518
+ "grad_norm": 0.8390095837968929,
519
  "learning_rate": 4.4700268840168045e-05,
520
+ "loss": 0.6789,
521
  "step": 73
522
  },
523
  {
524
  "epoch": 1.2982456140350878,
525
+ "grad_norm": 0.5703687799634394,
526
  "learning_rate": 4.4519850666916484e-05,
527
+ "loss": 0.6164,
528
  "step": 74
529
  },
530
  {
531
  "epoch": 1.3157894736842106,
532
+ "grad_norm": 0.6583590025049137,
533
  "learning_rate": 4.43367898183491e-05,
534
+ "loss": 0.7449,
535
  "step": 75
536
  },
537
  {
538
  "epoch": 1.3333333333333333,
539
+ "grad_norm": 0.5701968757608501,
540
  "learning_rate": 4.415111107797445e-05,
541
+ "loss": 0.6234,
542
  "step": 76
543
  },
544
  {
545
  "epoch": 1.3508771929824561,
546
+ "grad_norm": 0.5635406203198912,
547
  "learning_rate": 4.396283958372173e-05,
548
+ "loss": 0.6082,
549
  "step": 77
550
  },
551
  {
552
  "epoch": 1.368421052631579,
553
+ "grad_norm": 0.7103841851203655,
554
  "learning_rate": 4.377200082453749e-05,
555
+ "loss": 0.716,
556
  "step": 78
557
  },
558
  {
559
  "epoch": 1.3859649122807016,
560
+ "grad_norm": 0.7190614642338603,
561
  "learning_rate": 4.357862063693486e-05,
562
+ "loss": 0.7011,
563
  "step": 79
564
  },
565
  {
566
  "epoch": 1.4035087719298245,
567
+ "grad_norm": 0.6721203312978761,
568
  "learning_rate": 4.3382725201495723e-05,
569
+ "loss": 0.683,
570
  "step": 80
571
  },
572
  {
573
  "epoch": 1.4210526315789473,
574
+ "grad_norm": 0.8333526214828886,
575
  "learning_rate": 4.318434103932622e-05,
576
+ "loss": 0.679,
577
  "step": 81
578
  },
579
  {
580
  "epoch": 1.4385964912280702,
581
+ "grad_norm": 0.6238744952578675,
582
  "learning_rate": 4.2983495008466276e-05,
583
+ "loss": 0.663,
584
  "step": 82
585
  },
586
  {
587
  "epoch": 1.456140350877193,
588
+ "grad_norm": 0.6288416586111145,
589
  "learning_rate": 4.278021430025343e-05,
590
+ "loss": 0.5932,
591
  "step": 83
592
  },
593
  {
594
  "epoch": 1.4736842105263157,
595
+ "grad_norm": 0.5006163307702345,
596
  "learning_rate": 4.257452643564155e-05,
597
+ "loss": 0.5148,
598
  "step": 84
599
  },
600
  {
601
  "epoch": 1.4912280701754386,
602
+ "grad_norm": 0.6562209743705945,
603
  "learning_rate": 4.2366459261474933e-05,
604
+ "loss": 0.6796,
605
  "step": 85
606
  },
607
  {
608
  "epoch": 1.5087719298245614,
609
+ "grad_norm": 0.7234775007820414,
610
  "learning_rate": 4.215604094671835e-05,
611
+ "loss": 0.6544,
612
  "step": 86
613
  },
614
  {
615
  "epoch": 1.526315789473684,
616
+ "grad_norm": 0.6290459893449781,
617
  "learning_rate": 4.194329997864331e-05,
618
+ "loss": 0.8034,
619
  "step": 87
620
  },
621
  {
622
  "epoch": 1.543859649122807,
623
+ "grad_norm": 0.586369579542573,
624
  "learning_rate": 4.172826515897146e-05,
625
+ "loss": 0.6832,
626
  "step": 88
627
  },
628
  {
629
  "epoch": 1.5614035087719298,
630
+ "grad_norm": 0.5753053969606661,
631
  "learning_rate": 4.1510965599975196e-05,
632
+ "loss": 0.7196,
633
  "step": 89
634
  },
635
  {
636
  "epoch": 1.5789473684210527,
637
+ "grad_norm": 0.6527437493399515,
638
  "learning_rate": 4.129143072053638e-05,
639
+ "loss": 0.6268,
640
  "step": 90
641
  },
642
  {
643
  "epoch": 1.5964912280701755,
644
+ "grad_norm": 0.5347518989317783,
645
  "learning_rate": 4.1069690242163484e-05,
646
+ "loss": 0.66,
647
  "step": 91
648
  },
649
  {
650
  "epoch": 1.6140350877192984,
651
+ "grad_norm": 0.5466222360441246,
652
  "learning_rate": 4.0845774184967754e-05,
653
+ "loss": 0.6863,
654
  "step": 92
655
  },
656
  {
657
  "epoch": 1.631578947368421,
658
+ "grad_norm": 0.5287265061280089,
659
  "learning_rate": 4.0619712863599e-05,
660
+ "loss": 0.6054,
661
  "step": 93
662
  },
663
  {
664
  "epoch": 1.6491228070175439,
665
+ "grad_norm": 0.5020449411728808,
666
  "learning_rate": 4.039153688314145e-05,
667
+ "loss": 0.6106,
668
  "step": 94
669
  },
670
  {
671
  "epoch": 1.6666666666666665,
672
+ "grad_norm": 0.5326278815255991,
673
  "learning_rate": 4.0161277134970345e-05,
674
+ "loss": 0.7097,
675
  "step": 95
676
  },
677
  {
678
  "epoch": 1.6842105263157894,
679
+ "grad_norm": 0.5377095024358528,
680
  "learning_rate": 3.9928964792569655e-05,
681
+ "loss": 0.6227,
682
  "step": 96
683
  },
684
  {
685
  "epoch": 1.7017543859649122,
686
+ "grad_norm": 0.576841931441445,
687
  "learning_rate": 3.969463130731183e-05,
688
+ "loss": 0.7862,
689
  "step": 97
690
  },
691
  {
692
  "epoch": 1.719298245614035,
693
+ "grad_norm": 0.6066486813440297,
694
  "learning_rate": 3.945830840419966e-05,
695
+ "loss": 0.6729,
696
  "step": 98
697
  },
698
  {
699
  "epoch": 1.736842105263158,
700
+ "grad_norm": 0.5159053057386966,
701
  "learning_rate": 3.9220028077571295e-05,
702
+ "loss": 0.7059,
703
  "step": 99
704
  },
705
  {
706
  "epoch": 1.7543859649122808,
707
+ "grad_norm": 0.4950698707861886,
708
  "learning_rate": 3.897982258676867e-05,
709
+ "loss": 0.6351,
710
  "step": 100
711
  },
712
  {
713
  "epoch": 1.7719298245614035,
714
+ "grad_norm": 0.5420056949722063,
715
  "learning_rate": 3.873772445177015e-05,
716
+ "loss": 0.711,
717
  "step": 101
718
  },
719
  {
720
  "epoch": 1.7894736842105263,
721
+ "grad_norm": 0.4564353605846405,
722
  "learning_rate": 3.8493766448787825e-05,
723
+ "loss": 0.7022,
724
  "step": 102
725
  },
726
  {
727
  "epoch": 1.807017543859649,
728
+ "grad_norm": 0.5625326599444068,
729
  "learning_rate": 3.824798160583012e-05,
730
+ "loss": 0.7015,
731
  "step": 103
732
  },
733
  {
734
  "epoch": 1.8245614035087718,
735
+ "grad_norm": 0.48056030105986436,
736
  "learning_rate": 3.8000403198230387e-05,
737
+ "loss": 0.5815,
738
  "step": 104
739
  },
740
  {
741
  "epoch": 1.8421052631578947,
742
+ "grad_norm": 0.453650733165799,
743
  "learning_rate": 3.775106474414188e-05,
744
+ "loss": 0.5505,
745
  "step": 105
746
  },
747
  {
748
  "epoch": 1.8596491228070176,
749
+ "grad_norm": 0.9587534224527449,
750
  "learning_rate": 3.7500000000000003e-05,
751
+ "loss": 0.7681,
752
  "step": 106
753
  },
754
  {
755
  "epoch": 1.8771929824561404,
756
+ "grad_norm": 0.6628313082859163,
757
  "learning_rate": 3.7247242955952175e-05,
758
+ "loss": 0.6832,
759
  "step": 107
760
  },
761
  {
762
  "epoch": 1.8947368421052633,
763
+ "grad_norm": 0.5627752640940548,
764
  "learning_rate": 3.699282783125616e-05,
765
+ "loss": 0.6747,
766
  "step": 108
767
  },
768
  {
769
  "epoch": 1.912280701754386,
770
+ "grad_norm": 0.5379495312193924,
771
  "learning_rate": 3.673678906964727e-05,
772
+ "loss": 0.5904,
773
  "step": 109
774
  },
775
  {
776
  "epoch": 1.9298245614035088,
777
+ "grad_norm": 0.5290950361826221,
778
  "learning_rate": 3.6479161334675296e-05,
779
+ "loss": 0.7161,
780
  "step": 110
781
  },
782
  {
783
  "epoch": 1.9473684210526314,
784
+ "grad_norm": 0.6329223546784813,
785
  "learning_rate": 3.621997950501156e-05,
786
+ "loss": 0.7186,
787
  "step": 111
788
  },
789
  {
790
  "epoch": 1.9649122807017543,
791
+ "grad_norm": 0.722712664381171,
792
  "learning_rate": 3.5959278669726935e-05,
793
+ "loss": 0.6872,
794
  "step": 112
795
  },
796
  {
797
  "epoch": 1.9824561403508771,
798
+ "grad_norm": 0.5556537835599905,
799
  "learning_rate": 3.569709412354136e-05,
800
+ "loss": 0.7653,
801
  "step": 113
802
  },
803
  {
804
  "epoch": 2.0,
805
+ "grad_norm": 0.7972949317622412,
806
  "learning_rate": 3.543346136204545e-05,
807
+ "loss": 0.5974,
808
  "step": 114
809
  }
810
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14ce14cb5b4d1f6b7378f3a0d59f90353c316de79d943a6fb969a0b7df2b441c
3
  size 8081
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b09193bae298a0a841c37ce61432d61767b4d633b9680c59539c690bef78161
3
  size 8081