{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.8209806157354618,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04561003420752566,
      "grad_norm": 1.0824084281921387,
      "learning_rate": 4.9981876195011844e-05,
      "loss": 0.4811,
      "num_input_tokens_seen": 110848,
      "step": 5,
      "train_runtime": 30.3319,
      "train_tokens_per_second": 3654.501
    },
    {
      "epoch": 0.09122006841505131,
      "grad_norm": 0.6754106879234314,
      "learning_rate": 4.9908293271567286e-05,
      "loss": 0.2124,
      "num_input_tokens_seen": 220880,
      "step": 10,
      "train_runtime": 60.3932,
      "train_tokens_per_second": 3657.363
    },
    {
      "epoch": 0.13683010262257697,
      "grad_norm": 0.39871442317962646,
      "learning_rate": 4.977828505250903e-05,
      "loss": 0.1393,
      "num_input_tokens_seen": 332160,
      "step": 15,
      "train_runtime": 90.9477,
      "train_tokens_per_second": 3652.209
    },
    {
      "epoch": 0.18244013683010263,
      "grad_norm": 0.3741743564605713,
      "learning_rate": 4.959214604826831e-05,
      "loss": 0.1043,
      "num_input_tokens_seen": 442608,
      "step": 20,
      "train_runtime": 121.2095,
      "train_tokens_per_second": 3651.596
    },
    {
      "epoch": 0.22805017103762829,
      "grad_norm": 0.43203699588775635,
      "learning_rate": 4.935029792355834e-05,
      "loss": 0.0953,
      "num_input_tokens_seen": 552768,
      "step": 25,
      "train_runtime": 151.3115,
      "train_tokens_per_second": 3653.179
    },
    {
      "epoch": 0.27366020524515394,
      "grad_norm": 0.40135976672172546,
      "learning_rate": 4.9053288542168185e-05,
      "loss": 0.0867,
      "num_input_tokens_seen": 663056,
      "step": 30,
      "train_runtime": 181.4665,
      "train_tokens_per_second": 3653.876
    },
    {
      "epoch": 0.31927023945267957,
      "grad_norm": 0.33862611651420593,
      "learning_rate": 4.870179072587499e-05,
      "loss": 0.0625,
      "num_input_tokens_seen": 773936,
      "step": 35,
      "train_runtime": 211.7267,
      "train_tokens_per_second": 3655.353
    },
    {
      "epoch": 0.36488027366020526,
      "grad_norm": 0.41058799624443054,
      "learning_rate": 4.829660073028631e-05,
      "loss": 0.0727,
      "num_input_tokens_seen": 884832,
      "step": 40,
      "train_runtime": 241.9754,
      "train_tokens_per_second": 3656.702
    },
    {
      "epoch": 0.4104903078677309,
      "grad_norm": 0.5706074833869934,
      "learning_rate": 4.783863644106502e-05,
      "loss": 0.0727,
      "num_input_tokens_seen": 995744,
      "step": 45,
      "train_runtime": 272.301,
      "train_tokens_per_second": 3656.777
    },
    {
      "epoch": 0.45610034207525657,
      "grad_norm": 0.31977686285972595,
      "learning_rate": 4.73289352946231e-05,
      "loss": 0.0676,
      "num_input_tokens_seen": 1106256,
      "step": 50,
      "train_runtime": 302.3755,
      "train_tokens_per_second": 3658.551
    },
    {
      "epoch": 0.5017103762827823,
      "grad_norm": 0.29765012860298157,
      "learning_rate": 4.6768651927994434e-05,
      "loss": 0.0635,
      "num_input_tokens_seen": 1217840,
      "step": 55,
      "train_runtime": 332.9163,
      "train_tokens_per_second": 3658.097
    },
    {
      "epoch": 0.5473204104903079,
      "grad_norm": 0.3982709050178528,
      "learning_rate": 4.6159055563210604e-05,
      "loss": 0.0586,
      "num_input_tokens_seen": 1328544,
      "step": 60,
      "train_runtime": 363.1815,
      "train_tokens_per_second": 3658.072
    },
    {
      "epoch": 0.5929304446978335,
      "grad_norm": 0.24241122603416443,
      "learning_rate": 4.550152713210478e-05,
      "loss": 0.0572,
      "num_input_tokens_seen": 1438848,
      "step": 65,
      "train_runtime": 393.3602,
      "train_tokens_per_second": 3657.838
    },
    {
      "epoch": 0.6385404789053591,
      "grad_norm": 0.2856714427471161,
      "learning_rate": 4.479755614805688e-05,
      "loss": 0.054,
      "num_input_tokens_seen": 1547904,
      "step": 70,
      "train_runtime": 423.1722,
      "train_tokens_per_second": 3657.858
    },
    {
      "epoch": 0.6841505131128849,
      "grad_norm": 0.3355858325958252,
      "learning_rate": 4.404873733176678e-05,
      "loss": 0.047,
      "num_input_tokens_seen": 1658736,
      "step": 75,
      "train_runtime": 453.4813,
      "train_tokens_per_second": 3657.782
    },
    {
      "epoch": 0.7297605473204105,
      "grad_norm": 0.3131825029850006,
      "learning_rate": 4.3256766998698936e-05,
      "loss": 0.0471,
      "num_input_tokens_seen": 1769248,
      "step": 80,
      "train_runtime": 483.7293,
      "train_tokens_per_second": 3657.517
    },
    {
      "epoch": 0.7753705815279361,
      "grad_norm": 0.2504868805408478,
      "learning_rate": 4.242343921638234e-05,
      "loss": 0.0408,
      "num_input_tokens_seen": 1879024,
      "step": 85,
      "train_runtime": 513.7413,
      "train_tokens_per_second": 3657.529
    },
    {
      "epoch": 0.8209806157354618,
      "grad_norm": 0.4044341742992401,
      "learning_rate": 4.155064174027047e-05,
      "loss": 0.0398,
      "num_input_tokens_seen": 1991024,
      "step": 90,
      "train_runtime": 544.3342,
      "train_tokens_per_second": 3657.724
    },
    {
      "epoch": 0.8665906499429875,
      "grad_norm": 0.3132927119731903,
      "learning_rate": 4.064035173736804e-05,
      "loss": 0.0369,
      "num_input_tokens_seen": 2102064,
      "step": 95,
      "train_runtime": 574.6371,
      "train_tokens_per_second": 3658.072
    },
    {
      "epoch": 0.9122006841505131,
      "grad_norm": 0.3075104057788849,
      "learning_rate": 3.969463130731183e-05,
      "loss": 0.0408,
      "num_input_tokens_seen": 2212432,
      "step": 100,
      "train_runtime": 604.7406,
      "train_tokens_per_second": 3658.481
    },
    {
      "epoch": 0.9578107183580388,
      "grad_norm": 0.3352231979370117,
      "learning_rate": 3.871562281105175e-05,
      "loss": 0.0441,
      "num_input_tokens_seen": 2322032,
      "step": 105,
      "train_runtime": 635.488,
      "train_tokens_per_second": 3653.936
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.36991724371910095,
      "learning_rate": 3.770554401771423e-05,
      "loss": 0.0356,
      "num_input_tokens_seen": 2423872,
      "step": 110,
      "train_runtime": 663.2539,
      "train_tokens_per_second": 3654.516
    },
    {
      "epoch": 1.0456100342075256,
      "grad_norm": 0.31266242265701294,
      "learning_rate": 3.6666683080641846e-05,
      "loss": 0.0358,
      "num_input_tokens_seen": 2534752,
      "step": 115,
      "train_runtime": 693.4736,
      "train_tokens_per_second": 3655.153
    },
    {
      "epoch": 1.0912200684150513,
      "grad_norm": 0.29535725712776184,
      "learning_rate": 3.5601393353990046e-05,
      "loss": 0.0388,
      "num_input_tokens_seen": 2644464,
      "step": 120,
      "train_runtime": 723.4647,
      "train_tokens_per_second": 3655.277
    },
    {
      "epoch": 1.1368301026225769,
      "grad_norm": 0.37989553809165955,
      "learning_rate": 3.4512088061623075e-05,
      "loss": 0.0323,
      "num_input_tokens_seen": 2755568,
      "step": 125,
      "train_runtime": 753.8099,
      "train_tokens_per_second": 3655.521
    },
    {
      "epoch": 1.1824401368301025,
      "grad_norm": 0.2450297474861145,
      "learning_rate": 3.3401234830385756e-05,
      "loss": 0.031,
      "num_input_tokens_seen": 2865744,
      "step": 130,
      "train_runtime": 783.9325,
      "train_tokens_per_second": 3655.6
    },
    {
      "epoch": 1.2280501710376284,
      "grad_norm": 0.35642582178115845,
      "learning_rate": 3.2271350100134975e-05,
      "loss": 0.0279,
      "num_input_tokens_seen": 2976304,
      "step": 135,
      "train_runtime": 814.2192,
      "train_tokens_per_second": 3655.409
    },
    {
      "epoch": 1.273660205245154,
      "grad_norm": 0.34205862879753113,
      "learning_rate": 3.11249934231941e-05,
      "loss": 0.0285,
      "num_input_tokens_seen": 3086224,
      "step": 140,
      "train_runtime": 844.255,
      "train_tokens_per_second": 3655.559
    },
    {
      "epoch": 1.3192702394526796,
      "grad_norm": 0.4949014484882355,
      "learning_rate": 2.996476166614364e-05,
      "loss": 0.0292,
      "num_input_tokens_seen": 3196352,
      "step": 145,
      "train_runtime": 874.2926,
      "train_tokens_per_second": 3655.929
    },
    {
      "epoch": 1.3648802736602053,
      "grad_norm": 0.3114229142665863,
      "learning_rate": 2.8793283127083292e-05,
      "loss": 0.0299,
      "num_input_tokens_seen": 3307744,
      "step": 150,
      "train_runtime": 904.5859,
      "train_tokens_per_second": 3656.639
    },
    {
      "epoch": 1.4104903078677309,
      "grad_norm": 0.3953075706958771,
      "learning_rate": 2.761321158169134e-05,
      "loss": 0.0345,
      "num_input_tokens_seen": 3418016,
      "step": 155,
      "train_runtime": 934.6229,
      "train_tokens_per_second": 3657.107
    },
    {
      "epoch": 1.4561003420752565,
      "grad_norm": 0.5266784429550171,
      "learning_rate": 2.6427220271569203e-05,
      "loss": 0.0338,
      "num_input_tokens_seen": 3528544,
      "step": 160,
      "train_runtime": 964.715,
      "train_tokens_per_second": 3657.602
    },
    {
      "epoch": 1.5017103762827824,
      "grad_norm": 0.4755711853504181,
      "learning_rate": 2.523799584848942e-05,
      "loss": 0.0363,
      "num_input_tokens_seen": 3639056,
      "step": 165,
      "train_runtime": 994.801,
      "train_tokens_per_second": 3658.074
    },
    {
      "epoch": 1.547320410490308,
      "grad_norm": 0.4198484420776367,
      "learning_rate": 2.4048232288265253e-05,
      "loss": 0.0294,
      "num_input_tokens_seen": 3749824,
      "step": 170,
      "train_runtime": 1025.0451,
      "train_tokens_per_second": 3658.204
    },
    {
      "epoch": 1.5929304446978336,
      "grad_norm": 0.40441790223121643,
      "learning_rate": 2.2860624788029013e-05,
      "loss": 0.0334,
      "num_input_tokens_seen": 3859776,
      "step": 175,
      "train_runtime": 1055.0684,
      "train_tokens_per_second": 3658.318
    },
    {
      "epoch": 1.6385404789053593,
      "grad_norm": 0.7088226079940796,
      "learning_rate": 2.167786366074365e-05,
      "loss": 0.0296,
      "num_input_tokens_seen": 3970576,
      "step": 180,
      "train_runtime": 1085.3469,
      "train_tokens_per_second": 3658.347
    },
    {
      "epoch": 1.6841505131128849,
      "grad_norm": 0.40556278824806213,
      "learning_rate": 2.0502628240778655e-05,
      "loss": 0.0319,
      "num_input_tokens_seen": 4081568,
      "step": 185,
      "train_runtime": 1115.6872,
      "train_tokens_per_second": 3658.344
    },
    {
      "epoch": 1.7297605473204105,
      "grad_norm": 0.32311075925827026,
      "learning_rate": 1.9337580814355888e-05,
      "loss": 0.0324,
      "num_input_tokens_seen": 4191264,
      "step": 190,
      "train_runtime": 1145.6081,
      "train_tokens_per_second": 3658.55
    },
    {
      "epoch": 1.7753705815279361,
      "grad_norm": 0.3284579813480377,
      "learning_rate": 1.8185360588615058e-05,
      "loss": 0.0337,
      "num_input_tokens_seen": 4301440,
      "step": 195,
      "train_runtime": 1175.6929,
      "train_tokens_per_second": 3658.642
    },
    {
      "epoch": 1.8209806157354618,
      "grad_norm": 0.2036353498697281,
      "learning_rate": 1.7048577712960627e-05,
      "loss": 0.0356,
      "num_input_tokens_seen": 4412000,
      "step": 200,
      "train_runtime": 1206.0273,
      "train_tokens_per_second": 3658.292
    }
  ],
  "logging_steps": 5,
  "max_steps": 330,
  "num_input_tokens_seen": 4412000,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.99225523994624e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}