yueqis commited on
Commit
cdc6d1a
·
verified ·
1 Parent(s): e3ab33e

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +1170 -0
trainer_state.json ADDED
@@ -0,0 +1,1170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.9984496124031008,
6
+ "eval_steps": 500,
7
+ "global_step": 161,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.006201550387596899,
14
+ "grad_norm": 15.26399338835659,
15
+ "learning_rate": 0.0,
16
+ "loss": 1.9626,
17
+ "step": 1
18
+ },
19
+ {
20
+ "epoch": 0.012403100775193798,
21
+ "grad_norm": 14.720768346647702,
22
+ "learning_rate": 1.111111111111111e-06,
23
+ "loss": 1.9459,
24
+ "step": 2
25
+ },
26
+ {
27
+ "epoch": 0.018604651162790697,
28
+ "grad_norm": 15.485169481168326,
29
+ "learning_rate": 2.222222222222222e-06,
30
+ "loss": 2.3093,
31
+ "step": 3
32
+ },
33
+ {
34
+ "epoch": 0.024806201550387597,
35
+ "grad_norm": 15.238736100868511,
36
+ "learning_rate": 3.3333333333333333e-06,
37
+ "loss": 2.0299,
38
+ "step": 4
39
+ },
40
+ {
41
+ "epoch": 0.031007751937984496,
42
+ "grad_norm": 10.917974882633493,
43
+ "learning_rate": 4.444444444444444e-06,
44
+ "loss": 1.9934,
45
+ "step": 5
46
+ },
47
+ {
48
+ "epoch": 0.037209302325581395,
49
+ "grad_norm": 3.5887710787919214,
50
+ "learning_rate": 5.555555555555557e-06,
51
+ "loss": 1.0246,
52
+ "step": 6
53
+ },
54
+ {
55
+ "epoch": 0.043410852713178294,
56
+ "grad_norm": 6.2370114556413,
57
+ "learning_rate": 6.666666666666667e-06,
58
+ "loss": 1.5176,
59
+ "step": 7
60
+ },
61
+ {
62
+ "epoch": 0.04961240310077519,
63
+ "grad_norm": 5.919162169895783,
64
+ "learning_rate": 7.77777777777778e-06,
65
+ "loss": 1.5474,
66
+ "step": 8
67
+ },
68
+ {
69
+ "epoch": 0.05581395348837209,
70
+ "grad_norm": 4.325467650355213,
71
+ "learning_rate": 8.888888888888888e-06,
72
+ "loss": 1.2359,
73
+ "step": 9
74
+ },
75
+ {
76
+ "epoch": 0.06201550387596899,
77
+ "grad_norm": 3.619075212455386,
78
+ "learning_rate": 1e-05,
79
+ "loss": 0.9838,
80
+ "step": 10
81
+ },
82
+ {
83
+ "epoch": 0.06821705426356589,
84
+ "grad_norm": 3.471405941940013,
85
+ "learning_rate": 9.998932083939657e-06,
86
+ "loss": 0.9981,
87
+ "step": 11
88
+ },
89
+ {
90
+ "epoch": 0.07441860465116279,
91
+ "grad_norm": 2.7880671807653483,
92
+ "learning_rate": 9.995728791936505e-06,
93
+ "loss": 0.8744,
94
+ "step": 12
95
+ },
96
+ {
97
+ "epoch": 0.08062015503875969,
98
+ "grad_norm": 6.034968083519955,
99
+ "learning_rate": 9.990391492329341e-06,
100
+ "loss": 0.7676,
101
+ "step": 13
102
+ },
103
+ {
104
+ "epoch": 0.08682170542635659,
105
+ "grad_norm": 1.9803400973575944,
106
+ "learning_rate": 9.98292246503335e-06,
107
+ "loss": 0.5414,
108
+ "step": 14
109
+ },
110
+ {
111
+ "epoch": 0.09302325581395349,
112
+ "grad_norm": 1.234736717659378,
113
+ "learning_rate": 9.973324900566214e-06,
114
+ "loss": 0.53,
115
+ "step": 15
116
+ },
117
+ {
118
+ "epoch": 0.09922480620155039,
119
+ "grad_norm": 1.0213862287647824,
120
+ "learning_rate": 9.961602898685225e-06,
121
+ "loss": 0.5062,
122
+ "step": 16
123
+ },
124
+ {
125
+ "epoch": 0.10542635658914729,
126
+ "grad_norm": 1.1013669003760953,
127
+ "learning_rate": 9.947761466636014e-06,
128
+ "loss": 0.557,
129
+ "step": 17
130
+ },
131
+ {
132
+ "epoch": 0.11162790697674418,
133
+ "grad_norm": 0.7019092049936988,
134
+ "learning_rate": 9.931806517013612e-06,
135
+ "loss": 0.4931,
136
+ "step": 18
137
+ },
138
+ {
139
+ "epoch": 0.11782945736434108,
140
+ "grad_norm": 0.6803712338561847,
141
+ "learning_rate": 9.913744865236798e-06,
142
+ "loss": 0.493,
143
+ "step": 19
144
+ },
145
+ {
146
+ "epoch": 0.12403100775193798,
147
+ "grad_norm": 3.1222946795280255,
148
+ "learning_rate": 9.893584226636773e-06,
149
+ "loss": 0.5671,
150
+ "step": 20
151
+ },
152
+ {
153
+ "epoch": 0.13023255813953488,
154
+ "grad_norm": 0.4533299829843851,
155
+ "learning_rate": 9.871333213161438e-06,
156
+ "loss": 0.3441,
157
+ "step": 21
158
+ },
159
+ {
160
+ "epoch": 0.13643410852713178,
161
+ "grad_norm": 0.449537089022575,
162
+ "learning_rate": 9.847001329696653e-06,
163
+ "loss": 0.3215,
164
+ "step": 22
165
+ },
166
+ {
167
+ "epoch": 0.14263565891472868,
168
+ "grad_norm": 0.6774019887238769,
169
+ "learning_rate": 9.820598970006068e-06,
170
+ "loss": 0.4402,
171
+ "step": 23
172
+ },
173
+ {
174
+ "epoch": 0.14883720930232558,
175
+ "grad_norm": 0.45963497537736786,
176
+ "learning_rate": 9.792137412291265e-06,
177
+ "loss": 0.4022,
178
+ "step": 24
179
+ },
180
+ {
181
+ "epoch": 0.15503875968992248,
182
+ "grad_norm": 0.4722916828215964,
183
+ "learning_rate": 9.761628814374074e-06,
184
+ "loss": 0.4308,
185
+ "step": 25
186
+ },
187
+ {
188
+ "epoch": 0.16124031007751938,
189
+ "grad_norm": 0.6226919889929532,
190
+ "learning_rate": 9.729086208503174e-06,
191
+ "loss": 0.4193,
192
+ "step": 26
193
+ },
194
+ {
195
+ "epoch": 0.16744186046511628,
196
+ "grad_norm": 0.3944344161765665,
197
+ "learning_rate": 9.694523495787149e-06,
198
+ "loss": 0.3406,
199
+ "step": 27
200
+ },
201
+ {
202
+ "epoch": 0.17364341085271318,
203
+ "grad_norm": 0.6421115422344923,
204
+ "learning_rate": 9.657955440256396e-06,
205
+ "loss": 0.3788,
206
+ "step": 28
207
+ },
208
+ {
209
+ "epoch": 0.17984496124031008,
210
+ "grad_norm": 0.4023252512533979,
211
+ "learning_rate": 9.619397662556434e-06,
212
+ "loss": 0.3484,
213
+ "step": 29
214
+ },
215
+ {
216
+ "epoch": 0.18604651162790697,
217
+ "grad_norm": 0.5280821694606184,
218
+ "learning_rate": 9.578866633275289e-06,
219
+ "loss": 0.3579,
220
+ "step": 30
221
+ },
222
+ {
223
+ "epoch": 0.19224806201550387,
224
+ "grad_norm": 0.3639394212633886,
225
+ "learning_rate": 9.536379665907801e-06,
226
+ "loss": 0.3281,
227
+ "step": 31
228
+ },
229
+ {
230
+ "epoch": 0.19844961240310077,
231
+ "grad_norm": 0.3343773044134941,
232
+ "learning_rate": 9.491954909459895e-06,
233
+ "loss": 0.3163,
234
+ "step": 32
235
+ },
236
+ {
237
+ "epoch": 0.20465116279069767,
238
+ "grad_norm": 0.6336122974597818,
239
+ "learning_rate": 9.445611340695926e-06,
240
+ "loss": 0.3736,
241
+ "step": 33
242
+ },
243
+ {
244
+ "epoch": 0.21085271317829457,
245
+ "grad_norm": 0.29713001995988114,
246
+ "learning_rate": 9.397368756032445e-06,
247
+ "loss": 0.3299,
248
+ "step": 34
249
+ },
250
+ {
251
+ "epoch": 0.21705426356589147,
252
+ "grad_norm": 0.3516643576671143,
253
+ "learning_rate": 9.347247763081834e-06,
254
+ "loss": 0.2795,
255
+ "step": 35
256
+ },
257
+ {
258
+ "epoch": 0.22325581395348837,
259
+ "grad_norm": 0.46593544409154286,
260
+ "learning_rate": 9.295269771849426e-06,
261
+ "loss": 0.3805,
262
+ "step": 36
263
+ },
264
+ {
265
+ "epoch": 0.22945736434108527,
266
+ "grad_norm": 1.385652999055083,
267
+ "learning_rate": 9.241456985587868e-06,
268
+ "loss": 0.4296,
269
+ "step": 37
270
+ },
271
+ {
272
+ "epoch": 0.23565891472868217,
273
+ "grad_norm": 0.45985433788020086,
274
+ "learning_rate": 9.185832391312644e-06,
275
+ "loss": 0.3928,
276
+ "step": 38
277
+ },
278
+ {
279
+ "epoch": 0.24186046511627907,
280
+ "grad_norm": 0.719866169139414,
281
+ "learning_rate": 9.12841974998278e-06,
282
+ "loss": 0.3674,
283
+ "step": 39
284
+ },
285
+ {
286
+ "epoch": 0.24806201550387597,
287
+ "grad_norm": 0.4245555272468952,
288
+ "learning_rate": 9.069243586350976e-06,
289
+ "loss": 0.3952,
290
+ "step": 40
291
+ },
292
+ {
293
+ "epoch": 0.25426356589147286,
294
+ "grad_norm": 0.32715050071028695,
295
+ "learning_rate": 9.008329178487442e-06,
296
+ "loss": 0.3118,
297
+ "step": 41
298
+ },
299
+ {
300
+ "epoch": 0.26046511627906976,
301
+ "grad_norm": 0.323913785807112,
302
+ "learning_rate": 8.94570254698197e-06,
303
+ "loss": 0.2915,
304
+ "step": 42
305
+ },
306
+ {
307
+ "epoch": 0.26666666666666666,
308
+ "grad_norm": 0.44726444195044723,
309
+ "learning_rate": 8.881390443828788e-06,
310
+ "loss": 0.3854,
311
+ "step": 43
312
+ },
313
+ {
314
+ "epoch": 0.27286821705426356,
315
+ "grad_norm": 0.66664094212195,
316
+ "learning_rate": 8.815420340999034e-06,
317
+ "loss": 0.3049,
318
+ "step": 44
319
+ },
320
+ {
321
+ "epoch": 0.27906976744186046,
322
+ "grad_norm": 0.4249980060595529,
323
+ "learning_rate": 8.747820418705632e-06,
324
+ "loss": 0.3438,
325
+ "step": 45
326
+ },
327
+ {
328
+ "epoch": 0.28527131782945736,
329
+ "grad_norm": 0.5176821224911877,
330
+ "learning_rate": 8.67861955336566e-06,
331
+ "loss": 0.4039,
332
+ "step": 46
333
+ },
334
+ {
335
+ "epoch": 0.29147286821705426,
336
+ "grad_norm": 0.28158982412253863,
337
+ "learning_rate": 8.607847305265312e-06,
338
+ "loss": 0.3504,
339
+ "step": 47
340
+ },
341
+ {
342
+ "epoch": 0.29767441860465116,
343
+ "grad_norm": 0.5840042318469005,
344
+ "learning_rate": 8.535533905932739e-06,
345
+ "loss": 0.3256,
346
+ "step": 48
347
+ },
348
+ {
349
+ "epoch": 0.30387596899224806,
350
+ "grad_norm": 0.31414108766842425,
351
+ "learning_rate": 8.461710245224149e-06,
352
+ "loss": 0.3202,
353
+ "step": 49
354
+ },
355
+ {
356
+ "epoch": 0.31007751937984496,
357
+ "grad_norm": 1.2797772790133255,
358
+ "learning_rate": 8.386407858128707e-06,
359
+ "loss": 0.3636,
360
+ "step": 50
361
+ },
362
+ {
363
+ "epoch": 0.31627906976744186,
364
+ "grad_norm": 0.3482350388750869,
365
+ "learning_rate": 8.309658911297833e-06,
366
+ "loss": 0.2954,
367
+ "step": 51
368
+ },
369
+ {
370
+ "epoch": 0.32248062015503876,
371
+ "grad_norm": 0.42356575149686077,
372
+ "learning_rate": 8.231496189304704e-06,
373
+ "loss": 0.3755,
374
+ "step": 52
375
+ },
376
+ {
377
+ "epoch": 0.32868217054263565,
378
+ "grad_norm": 0.37022827136720965,
379
+ "learning_rate": 8.151953080639777e-06,
380
+ "loss": 0.3133,
381
+ "step": 53
382
+ },
383
+ {
384
+ "epoch": 0.33488372093023255,
385
+ "grad_norm": 0.45808972723840075,
386
+ "learning_rate": 8.071063563448341e-06,
387
+ "loss": 0.3131,
388
+ "step": 54
389
+ },
390
+ {
391
+ "epoch": 0.34108527131782945,
392
+ "grad_norm": 0.3131249221444263,
393
+ "learning_rate": 7.988862191016204e-06,
394
+ "loss": 0.3262,
395
+ "step": 55
396
+ },
397
+ {
398
+ "epoch": 0.34728682170542635,
399
+ "grad_norm": 0.5028658389613722,
400
+ "learning_rate": 7.905384077009693e-06,
401
+ "loss": 0.2962,
402
+ "step": 56
403
+ },
404
+ {
405
+ "epoch": 0.35348837209302325,
406
+ "grad_norm": 0.3765836806930317,
407
+ "learning_rate": 7.820664880476257e-06,
408
+ "loss": 0.3573,
409
+ "step": 57
410
+ },
411
+ {
412
+ "epoch": 0.35968992248062015,
413
+ "grad_norm": 1.3467161045716638,
414
+ "learning_rate": 7.734740790612137e-06,
415
+ "loss": 0.3903,
416
+ "step": 58
417
+ },
418
+ {
419
+ "epoch": 0.36589147286821705,
420
+ "grad_norm": 0.32717489874399847,
421
+ "learning_rate": 7.647648511303545e-06,
422
+ "loss": 0.3114,
423
+ "step": 59
424
+ },
425
+ {
426
+ "epoch": 0.37209302325581395,
427
+ "grad_norm": 0.3265443857577057,
428
+ "learning_rate": 7.559425245448006e-06,
429
+ "loss": 0.3298,
430
+ "step": 60
431
+ },
432
+ {
433
+ "epoch": 0.37829457364341085,
434
+ "grad_norm": 0.3241255915038842,
435
+ "learning_rate": 7.470108679062521e-06,
436
+ "loss": 0.236,
437
+ "step": 61
438
+ },
439
+ {
440
+ "epoch": 0.38449612403100775,
441
+ "grad_norm": 0.3991561591891734,
442
+ "learning_rate": 7.379736965185369e-06,
443
+ "loss": 0.3525,
444
+ "step": 62
445
+ },
446
+ {
447
+ "epoch": 0.39069767441860465,
448
+ "grad_norm": 0.32423623161134224,
449
+ "learning_rate": 7.288348707578409e-06,
450
+ "loss": 0.3046,
451
+ "step": 63
452
+ },
453
+ {
454
+ "epoch": 0.39689922480620154,
455
+ "grad_norm": 0.3171873001618541,
456
+ "learning_rate": 7.195982944236853e-06,
457
+ "loss": 0.3594,
458
+ "step": 64
459
+ },
460
+ {
461
+ "epoch": 0.40310077519379844,
462
+ "grad_norm": 0.28143210002099517,
463
+ "learning_rate": 7.102679130713538e-06,
464
+ "loss": 0.2633,
465
+ "step": 65
466
+ },
467
+ {
468
+ "epoch": 0.40930232558139534,
469
+ "grad_norm": 0.2641885107385758,
470
+ "learning_rate": 7.008477123264849e-06,
471
+ "loss": 0.274,
472
+ "step": 66
473
+ },
474
+ {
475
+ "epoch": 0.41550387596899224,
476
+ "grad_norm": 0.27510115989973793,
477
+ "learning_rate": 6.913417161825449e-06,
478
+ "loss": 0.2948,
479
+ "step": 67
480
+ },
481
+ {
482
+ "epoch": 0.42170542635658914,
483
+ "grad_norm": 0.2818463816101008,
484
+ "learning_rate": 6.817539852819149e-06,
485
+ "loss": 0.2716,
486
+ "step": 68
487
+ },
488
+ {
489
+ "epoch": 0.42790697674418604,
490
+ "grad_norm": 0.296666971572694,
491
+ "learning_rate": 6.720886151813194e-06,
492
+ "loss": 0.295,
493
+ "step": 69
494
+ },
495
+ {
496
+ "epoch": 0.43410852713178294,
497
+ "grad_norm": 1.9121660637037805,
498
+ "learning_rate": 6.6234973460234184e-06,
499
+ "loss": 0.3237,
500
+ "step": 70
501
+ },
502
+ {
503
+ "epoch": 0.44031007751937984,
504
+ "grad_norm": 0.38478679027888313,
505
+ "learning_rate": 6.525415036677745e-06,
506
+ "loss": 0.3505,
507
+ "step": 71
508
+ },
509
+ {
510
+ "epoch": 0.44651162790697674,
511
+ "grad_norm": 0.2742653943674723,
512
+ "learning_rate": 6.426681121245527e-06,
513
+ "loss": 0.3313,
514
+ "step": 72
515
+ },
516
+ {
517
+ "epoch": 0.45271317829457364,
518
+ "grad_norm": 0.3058386741176561,
519
+ "learning_rate": 6.327337775540362e-06,
520
+ "loss": 0.2985,
521
+ "step": 73
522
+ },
523
+ {
524
+ "epoch": 0.45891472868217054,
525
+ "grad_norm": 0.29164203779704884,
526
+ "learning_rate": 6.227427435703997e-06,
527
+ "loss": 0.2924,
528
+ "step": 74
529
+ },
530
+ {
531
+ "epoch": 0.46511627906976744,
532
+ "grad_norm": 0.31050749736435646,
533
+ "learning_rate": 6.126992780079032e-06,
534
+ "loss": 0.2931,
535
+ "step": 75
536
+ },
537
+ {
538
+ "epoch": 0.47131782945736433,
539
+ "grad_norm": 0.2794828298595081,
540
+ "learning_rate": 6.026076710978172e-06,
541
+ "loss": 0.2933,
542
+ "step": 76
543
+ },
544
+ {
545
+ "epoch": 0.47751937984496123,
546
+ "grad_norm": 0.28252753496671273,
547
+ "learning_rate": 5.924722336357793e-06,
548
+ "loss": 0.2703,
549
+ "step": 77
550
+ },
551
+ {
552
+ "epoch": 0.48372093023255813,
553
+ "grad_norm": 0.3272997547053633,
554
+ "learning_rate": 5.82297295140367e-06,
555
+ "loss": 0.3266,
556
+ "step": 78
557
+ },
558
+ {
559
+ "epoch": 0.48992248062015503,
560
+ "grad_norm": 0.26499070641459116,
561
+ "learning_rate": 5.720872020036734e-06,
562
+ "loss": 0.3111,
563
+ "step": 79
564
+ },
565
+ {
566
+ "epoch": 0.49612403100775193,
567
+ "grad_norm": 0.28778684221197826,
568
+ "learning_rate": 5.61846315634674e-06,
569
+ "loss": 0.2942,
570
+ "step": 80
571
+ },
572
+ {
573
+ "epoch": 0.5023255813953489,
574
+ "grad_norm": 0.37204267144771097,
575
+ "learning_rate": 5.515790105961785e-06,
576
+ "loss": 0.3478,
577
+ "step": 81
578
+ },
579
+ {
580
+ "epoch": 0.5085271317829457,
581
+ "grad_norm": 0.2847653373131156,
582
+ "learning_rate": 5.412896727361663e-06,
583
+ "loss": 0.3024,
584
+ "step": 82
585
+ },
586
+ {
587
+ "epoch": 0.5147286821705427,
588
+ "grad_norm": 0.3322812245008778,
589
+ "learning_rate": 5.309826973142974e-06,
590
+ "loss": 0.2983,
591
+ "step": 83
592
+ },
593
+ {
594
+ "epoch": 0.5209302325581395,
595
+ "grad_norm": 0.356939865277603,
596
+ "learning_rate": 5.206624871244066e-06,
597
+ "loss": 0.3077,
598
+ "step": 84
599
+ },
600
+ {
601
+ "epoch": 0.5271317829457365,
602
+ "grad_norm": 0.2732990191138046,
603
+ "learning_rate": 5.103334506137773e-06,
604
+ "loss": 0.2717,
605
+ "step": 85
606
+ },
607
+ {
608
+ "epoch": 0.5333333333333333,
609
+ "grad_norm": 0.3344747136592173,
610
+ "learning_rate": 5e-06,
611
+ "loss": 0.2807,
612
+ "step": 86
613
+ },
614
+ {
615
+ "epoch": 0.5395348837209303,
616
+ "grad_norm": 0.3395777579574581,
617
+ "learning_rate": 4.89666549386223e-06,
618
+ "loss": 0.3519,
619
+ "step": 87
620
+ },
621
+ {
622
+ "epoch": 0.5457364341085271,
623
+ "grad_norm": 0.37507314187909735,
624
+ "learning_rate": 4.793375128755934e-06,
625
+ "loss": 0.32,
626
+ "step": 88
627
+ },
628
+ {
629
+ "epoch": 0.5519379844961241,
630
+ "grad_norm": 0.3024187395636649,
631
+ "learning_rate": 4.690173026857028e-06,
632
+ "loss": 0.2461,
633
+ "step": 89
634
+ },
635
+ {
636
+ "epoch": 0.5581395348837209,
637
+ "grad_norm": 0.2754899764445255,
638
+ "learning_rate": 4.587103272638339e-06,
639
+ "loss": 0.2991,
640
+ "step": 90
641
+ },
642
+ {
643
+ "epoch": 0.5643410852713179,
644
+ "grad_norm": 0.34895281559601965,
645
+ "learning_rate": 4.4842098940382155e-06,
646
+ "loss": 0.3146,
647
+ "step": 91
648
+ },
649
+ {
650
+ "epoch": 0.5705426356589147,
651
+ "grad_norm": 0.29391978438245,
652
+ "learning_rate": 4.381536843653262e-06,
653
+ "loss": 0.3185,
654
+ "step": 92
655
+ },
656
+ {
657
+ "epoch": 0.5767441860465117,
658
+ "grad_norm": 0.27690222515112517,
659
+ "learning_rate": 4.279127979963266e-06,
660
+ "loss": 0.2876,
661
+ "step": 93
662
+ },
663
+ {
664
+ "epoch": 0.5829457364341085,
665
+ "grad_norm": 0.3201838307069794,
666
+ "learning_rate": 4.17702704859633e-06,
667
+ "loss": 0.2929,
668
+ "step": 94
669
+ },
670
+ {
671
+ "epoch": 0.5891472868217055,
672
+ "grad_norm": 1.3427423380380996,
673
+ "learning_rate": 4.075277663642208e-06,
674
+ "loss": 0.2659,
675
+ "step": 95
676
+ },
677
+ {
678
+ "epoch": 0.5953488372093023,
679
+ "grad_norm": 0.30047579019920523,
680
+ "learning_rate": 3.973923289021829e-06,
681
+ "loss": 0.2735,
682
+ "step": 96
683
+ },
684
+ {
685
+ "epoch": 0.6015503875968993,
686
+ "grad_norm": 0.2742842789808185,
687
+ "learning_rate": 3.8730072199209705e-06,
688
+ "loss": 0.2531,
689
+ "step": 97
690
+ },
691
+ {
692
+ "epoch": 0.6077519379844961,
693
+ "grad_norm": 0.2682447160880254,
694
+ "learning_rate": 3.7725725642960047e-06,
695
+ "loss": 0.2706,
696
+ "step": 98
697
+ },
698
+ {
699
+ "epoch": 0.6139534883720931,
700
+ "grad_norm": 0.2500514931866793,
701
+ "learning_rate": 3.67266222445964e-06,
702
+ "loss": 0.262,
703
+ "step": 99
704
+ },
705
+ {
706
+ "epoch": 0.6201550387596899,
707
+ "grad_norm": 0.25293752189367336,
708
+ "learning_rate": 3.573318878754475e-06,
709
+ "loss": 0.2597,
710
+ "step": 100
711
+ },
712
+ {
713
+ "epoch": 0.6263565891472869,
714
+ "grad_norm": 0.445308919718186,
715
+ "learning_rate": 3.4745849633222566e-06,
716
+ "loss": 0.2683,
717
+ "step": 101
718
+ },
719
+ {
720
+ "epoch": 0.6325581395348837,
721
+ "grad_norm": 0.26411175014036536,
722
+ "learning_rate": 3.3765026539765832e-06,
723
+ "loss": 0.2843,
724
+ "step": 102
725
+ },
726
+ {
727
+ "epoch": 0.6387596899224807,
728
+ "grad_norm": 0.4447506815447959,
729
+ "learning_rate": 3.2791138481868084e-06,
730
+ "loss": 0.3165,
731
+ "step": 103
732
+ },
733
+ {
734
+ "epoch": 0.6449612403100775,
735
+ "grad_norm": 0.6788210803412132,
736
+ "learning_rate": 3.1824601471808504e-06,
737
+ "loss": 0.3379,
738
+ "step": 104
739
+ },
740
+ {
741
+ "epoch": 0.6511627906976745,
742
+ "grad_norm": 0.2756366217165372,
743
+ "learning_rate": 3.0865828381745515e-06,
744
+ "loss": 0.3209,
745
+ "step": 105
746
+ },
747
+ {
748
+ "epoch": 0.6573643410852713,
749
+ "grad_norm": 0.3006265900853834,
750
+ "learning_rate": 2.991522876735154e-06,
751
+ "loss": 0.3301,
752
+ "step": 106
753
+ },
754
+ {
755
+ "epoch": 0.6635658914728683,
756
+ "grad_norm": 0.48771082441163705,
757
+ "learning_rate": 2.8973208692864623e-06,
758
+ "loss": 0.259,
759
+ "step": 107
760
+ },
761
+ {
762
+ "epoch": 0.6697674418604651,
763
+ "grad_norm": 0.2860117833703043,
764
+ "learning_rate": 2.804017055763149e-06,
765
+ "loss": 0.299,
766
+ "step": 108
767
+ },
768
+ {
769
+ "epoch": 0.6759689922480621,
770
+ "grad_norm": 0.32363963719457234,
771
+ "learning_rate": 2.711651292421593e-06,
772
+ "loss": 0.2555,
773
+ "step": 109
774
+ },
775
+ {
776
+ "epoch": 0.6821705426356589,
777
+ "grad_norm": 0.2850162950975944,
778
+ "learning_rate": 2.6202630348146323e-06,
779
+ "loss": 0.316,
780
+ "step": 110
781
+ },
782
+ {
783
+ "epoch": 0.6883720930232559,
784
+ "grad_norm": 0.2762800647193149,
785
+ "learning_rate": 2.529891320937481e-06,
786
+ "loss": 0.2523,
787
+ "step": 111
788
+ },
789
+ {
790
+ "epoch": 0.6945736434108527,
791
+ "grad_norm": 0.26856974090502583,
792
+ "learning_rate": 2.4405747545519966e-06,
793
+ "loss": 0.2761,
794
+ "step": 112
795
+ },
796
+ {
797
+ "epoch": 0.7007751937984497,
798
+ "grad_norm": 0.2490891936662437,
799
+ "learning_rate": 2.352351488696457e-06,
800
+ "loss": 0.2575,
801
+ "step": 113
802
+ },
803
+ {
804
+ "epoch": 0.7069767441860465,
805
+ "grad_norm": 0.26341989498270973,
806
+ "learning_rate": 2.265259209387867e-06,
807
+ "loss": 0.2249,
808
+ "step": 114
809
+ },
810
+ {
811
+ "epoch": 0.7131782945736435,
812
+ "grad_norm": 0.27360549708121684,
813
+ "learning_rate": 2.179335119523745e-06,
814
+ "loss": 0.2697,
815
+ "step": 115
816
+ },
817
+ {
818
+ "epoch": 0.7193798449612403,
819
+ "grad_norm": 0.25823370272340357,
820
+ "learning_rate": 2.094615922990309e-06,
821
+ "loss": 0.2856,
822
+ "step": 116
823
+ },
824
+ {
825
+ "epoch": 0.7255813953488373,
826
+ "grad_norm": 0.2861381618206921,
827
+ "learning_rate": 2.0111378089837958e-06,
828
+ "loss": 0.2735,
829
+ "step": 117
830
+ },
831
+ {
832
+ "epoch": 0.7317829457364341,
833
+ "grad_norm": 0.2644689705355804,
834
+ "learning_rate": 1.928936436551661e-06,
835
+ "loss": 0.302,
836
+ "step": 118
837
+ },
838
+ {
839
+ "epoch": 0.737984496124031,
840
+ "grad_norm": 0.2590429498790151,
841
+ "learning_rate": 1.848046919360225e-06,
842
+ "loss": 0.2951,
843
+ "step": 119
844
+ },
845
+ {
846
+ "epoch": 0.7441860465116279,
847
+ "grad_norm": 0.2706830740807015,
848
+ "learning_rate": 1.7685038106952952e-06,
849
+ "loss": 0.2236,
850
+ "step": 120
851
+ },
852
+ {
853
+ "epoch": 0.7503875968992249,
854
+ "grad_norm": 0.3323189304107019,
855
+ "learning_rate": 1.6903410887021676e-06,
856
+ "loss": 0.2847,
857
+ "step": 121
858
+ },
859
+ {
860
+ "epoch": 0.7565891472868217,
861
+ "grad_norm": 0.2747309759947305,
862
+ "learning_rate": 1.6135921418712959e-06,
863
+ "loss": 0.3403,
864
+ "step": 122
865
+ },
866
+ {
867
+ "epoch": 0.7627906976744186,
868
+ "grad_norm": 0.24338533093448128,
869
+ "learning_rate": 1.5382897547758513e-06,
870
+ "loss": 0.2742,
871
+ "step": 123
872
+ },
873
+ {
874
+ "epoch": 0.7689922480620155,
875
+ "grad_norm": 1.490134687848016,
876
+ "learning_rate": 1.4644660940672628e-06,
877
+ "loss": 0.3482,
878
+ "step": 124
879
+ },
880
+ {
881
+ "epoch": 0.7751937984496124,
882
+ "grad_norm": 0.2612434597373152,
883
+ "learning_rate": 1.3921526947346902e-06,
884
+ "loss": 0.3018,
885
+ "step": 125
886
+ },
887
+ {
888
+ "epoch": 0.7813953488372093,
889
+ "grad_norm": 0.25036249030908275,
890
+ "learning_rate": 1.321380446634342e-06,
891
+ "loss": 0.3188,
892
+ "step": 126
893
+ },
894
+ {
895
+ "epoch": 0.7875968992248062,
896
+ "grad_norm": 0.2612566749541528,
897
+ "learning_rate": 1.2521795812943704e-06,
898
+ "loss": 0.2995,
899
+ "step": 127
900
+ },
901
+ {
902
+ "epoch": 0.7937984496124031,
903
+ "grad_norm": 0.35381824882639945,
904
+ "learning_rate": 1.1845796590009684e-06,
905
+ "loss": 0.287,
906
+ "step": 128
907
+ },
908
+ {
909
+ "epoch": 0.8,
910
+ "grad_norm": 0.27735996036817234,
911
+ "learning_rate": 1.118609556171213e-06,
912
+ "loss": 0.2765,
913
+ "step": 129
914
+ },
915
+ {
916
+ "epoch": 0.8062015503875969,
917
+ "grad_norm": 0.2770800666877129,
918
+ "learning_rate": 1.0542974530180327e-06,
919
+ "loss": 0.3346,
920
+ "step": 130
921
+ },
922
+ {
923
+ "epoch": 0.8124031007751938,
924
+ "grad_norm": 0.2786390048536498,
925
+ "learning_rate": 9.916708215125586e-07,
926
+ "loss": 0.2544,
927
+ "step": 131
928
+ },
929
+ {
930
+ "epoch": 0.8186046511627907,
931
+ "grad_norm": 0.2502926836308708,
932
+ "learning_rate": 9.307564136490255e-07,
933
+ "loss": 0.2888,
934
+ "step": 132
935
+ },
936
+ {
937
+ "epoch": 0.8248062015503876,
938
+ "grad_norm": 0.29354461108159396,
939
+ "learning_rate": 8.715802500172215e-07,
940
+ "loss": 0.2817,
941
+ "step": 133
942
+ },
943
+ {
944
+ "epoch": 0.8310077519379845,
945
+ "grad_norm": 0.29537836943938983,
946
+ "learning_rate": 8.141676086873574e-07,
947
+ "loss": 0.2752,
948
+ "step": 134
949
+ },
950
+ {
951
+ "epoch": 0.8372093023255814,
952
+ "grad_norm": 0.25157759814716646,
953
+ "learning_rate": 7.585430144121319e-07,
954
+ "loss": 0.2687,
955
+ "step": 135
956
+ },
957
+ {
958
+ "epoch": 0.8434108527131783,
959
+ "grad_norm": 0.25520703041985415,
960
+ "learning_rate": 7.047302281505735e-07,
961
+ "loss": 0.2348,
962
+ "step": 136
963
+ },
964
+ {
965
+ "epoch": 0.8496124031007752,
966
+ "grad_norm": 0.2804280034486318,
967
+ "learning_rate": 6.527522369181655e-07,
968
+ "loss": 0.3057,
969
+ "step": 137
970
+ },
971
+ {
972
+ "epoch": 0.8558139534883721,
973
+ "grad_norm": 0.2577385480731099,
974
+ "learning_rate": 6.026312439675553e-07,
975
+ "loss": 0.2553,
976
+ "step": 138
977
+ },
978
+ {
979
+ "epoch": 0.862015503875969,
980
+ "grad_norm": 0.4111388669958174,
981
+ "learning_rate": 5.543886593040737e-07,
982
+ "loss": 0.2228,
983
+ "step": 139
984
+ },
985
+ {
986
+ "epoch": 0.8682170542635659,
987
+ "grad_norm": 0.2512038377179209,
988
+ "learning_rate": 5.080450905401057e-07,
989
+ "loss": 0.2838,
990
+ "step": 140
991
+ },
992
+ {
993
+ "epoch": 0.8744186046511628,
994
+ "grad_norm": 0.2899187916626033,
995
+ "learning_rate": 4.6362033409220077e-07,
996
+ "loss": 0.325,
997
+ "step": 141
998
+ },
999
+ {
1000
+ "epoch": 0.8806201550387597,
1001
+ "grad_norm": 0.30346956388490826,
1002
+ "learning_rate": 4.211333667247125e-07,
1003
+ "loss": 0.3548,
1004
+ "step": 142
1005
+ },
1006
+ {
1007
+ "epoch": 0.8868217054263566,
1008
+ "grad_norm": 0.25434135497158716,
1009
+ "learning_rate": 3.8060233744356634e-07,
1010
+ "loss": 0.2554,
1011
+ "step": 143
1012
+ },
1013
+ {
1014
+ "epoch": 0.8930232558139535,
1015
+ "grad_norm": 0.2718193216598721,
1016
+ "learning_rate": 3.420445597436056e-07,
1017
+ "loss": 0.2956,
1018
+ "step": 144
1019
+ },
1020
+ {
1021
+ "epoch": 0.8992248062015504,
1022
+ "grad_norm": 0.2581092232290721,
1023
+ "learning_rate": 3.0547650421285216e-07,
1024
+ "loss": 0.2697,
1025
+ "step": 145
1026
+ },
1027
+ {
1028
+ "epoch": 0.9054263565891473,
1029
+ "grad_norm": 0.2522471348373793,
1030
+ "learning_rate": 2.7091379149682683e-07,
1031
+ "loss": 0.2962,
1032
+ "step": 146
1033
+ },
1034
+ {
1035
+ "epoch": 0.9116279069767442,
1036
+ "grad_norm": 0.24916983451448027,
1037
+ "learning_rate": 2.3837118562592799e-07,
1038
+ "loss": 0.2872,
1039
+ "step": 147
1040
+ },
1041
+ {
1042
+ "epoch": 0.9178294573643411,
1043
+ "grad_norm": 0.2599299055423855,
1044
+ "learning_rate": 2.0786258770873647e-07,
1045
+ "loss": 0.2807,
1046
+ "step": 148
1047
+ },
1048
+ {
1049
+ "epoch": 0.924031007751938,
1050
+ "grad_norm": 0.2804131521355199,
1051
+ "learning_rate": 1.7940102999393194e-07,
1052
+ "loss": 0.2809,
1053
+ "step": 149
1054
+ },
1055
+ {
1056
+ "epoch": 0.9302325581395349,
1057
+ "grad_norm": 0.3172859696325972,
1058
+ "learning_rate": 1.5299867030334815e-07,
1059
+ "loss": 0.3055,
1060
+ "step": 150
1061
+ },
1062
+ {
1063
+ "epoch": 0.9364341085271318,
1064
+ "grad_norm": 0.30376879245797783,
1065
+ "learning_rate": 1.286667868385627e-07,
1066
+ "loss": 0.3647,
1067
+ "step": 151
1068
+ },
1069
+ {
1070
+ "epoch": 0.9426356589147287,
1071
+ "grad_norm": 0.26932389563925824,
1072
+ "learning_rate": 1.0641577336322761e-07,
1073
+ "loss": 0.2771,
1074
+ "step": 152
1075
+ },
1076
+ {
1077
+ "epoch": 0.9488372093023256,
1078
+ "grad_norm": 0.28886881022603444,
1079
+ "learning_rate": 8.625513476320291e-08,
1080
+ "loss": 0.3155,
1081
+ "step": 153
1082
+ },
1083
+ {
1084
+ "epoch": 0.9550387596899225,
1085
+ "grad_norm": 0.2459036254604883,
1086
+ "learning_rate": 6.819348298638839e-08,
1087
+ "loss": 0.276,
1088
+ "step": 154
1089
+ },
1090
+ {
1091
+ "epoch": 0.9612403100775194,
1092
+ "grad_norm": 0.2706567588905982,
1093
+ "learning_rate": 5.223853336398632e-08,
1094
+ "loss": 0.3116,
1095
+ "step": 155
1096
+ },
1097
+ {
1098
+ "epoch": 0.9674418604651163,
1099
+ "grad_norm": 0.27809861760850124,
1100
+ "learning_rate": 3.839710131477492e-08,
1101
+ "loss": 0.3412,
1102
+ "step": 156
1103
+ },
1104
+ {
1105
+ "epoch": 0.9736434108527132,
1106
+ "grad_norm": 0.2790075651739876,
1107
+ "learning_rate": 2.6675099433787212e-08,
1108
+ "loss": 0.3247,
1109
+ "step": 157
1110
+ },
1111
+ {
1112
+ "epoch": 0.9798449612403101,
1113
+ "grad_norm": 0.2545419908224724,
1114
+ "learning_rate": 1.7077534966650767e-08,
1115
+ "loss": 0.3451,
1116
+ "step": 158
1117
+ },
1118
+ {
1119
+ "epoch": 0.986046511627907,
1120
+ "grad_norm": 0.23665306218981744,
1121
+ "learning_rate": 9.608507670659239e-09,
1122
+ "loss": 0.2523,
1123
+ "step": 159
1124
+ },
1125
+ {
1126
+ "epoch": 0.9922480620155039,
1127
+ "grad_norm": 0.28653358687035707,
1128
+ "learning_rate": 4.2712080634949024e-09,
1129
+ "loss": 0.2768,
1130
+ "step": 160
1131
+ },
1132
+ {
1133
+ "epoch": 0.9984496124031008,
1134
+ "grad_norm": 0.258292218409734,
1135
+ "learning_rate": 1.0679160603449533e-09,
1136
+ "loss": 0.353,
1137
+ "step": 161
1138
+ },
1139
+ {
1140
+ "epoch": 0.9984496124031008,
1141
+ "step": 161,
1142
+ "total_flos": 429783955537920.0,
1143
+ "train_loss": 0.41365621706343586,
1144
+ "train_runtime": 4919.9829,
1145
+ "train_samples_per_second": 2.097,
1146
+ "train_steps_per_second": 0.033
1147
+ }
1148
+ ],
1149
+ "logging_steps": 1.0,
1150
+ "max_steps": 161,
1151
+ "num_input_tokens_seen": 0,
1152
+ "num_train_epochs": 1,
1153
+ "save_steps": 500,
1154
+ "stateful_callbacks": {
1155
+ "TrainerControl": {
1156
+ "args": {
1157
+ "should_epoch_stop": false,
1158
+ "should_evaluate": false,
1159
+ "should_log": false,
1160
+ "should_save": true,
1161
+ "should_training_stop": true
1162
+ },
1163
+ "attributes": {}
1164
+ }
1165
+ },
1166
+ "total_flos": 429783955537920.0,
1167
+ "train_batch_size": 1,
1168
+ "trial_name": null,
1169
+ "trial_params": null
1170
+ }