terry69 commited on
Commit
4691054
·
verified ·
1 Parent(s): 3095b25

Model save

Browse files
README.md CHANGED
@@ -2,13 +2,10 @@
2
  license: llama2
3
  library_name: peft
4
  tags:
5
- - alignment-handbook
6
  - trl
7
  - sft
8
  - generated_from_trainer
9
  base_model: meta-llama/Llama-2-7b-hf
10
- datasets:
11
- - HuggingFaceH4/ultrachat_200k
12
  model-index:
13
  - name: llama2-20p-POE
14
  results: []
@@ -19,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
19
 
20
  # llama2-20p-POE
21
 
22
- This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on the HuggingFaceH4/ultrachat_200k dataset.
23
  It achieves the following results on the evaluation set:
24
  - Loss: nan
25
 
@@ -58,7 +55,7 @@ The following hyperparameters were used during training:
58
 
59
  | Training Loss | Epoch | Step | Validation Loss |
60
  |:-------------:|:-----:|:----:|:---------------:|
61
- | 0.7591 | 1.0 | 675 | nan |
62
 
63
 
64
  ### Framework versions
 
2
  license: llama2
3
  library_name: peft
4
  tags:
 
5
  - trl
6
  - sft
7
  - generated_from_trainer
8
  base_model: meta-llama/Llama-2-7b-hf
 
 
9
  model-index:
10
  - name: llama2-20p-POE
11
  results: []
 
16
 
17
  # llama2-20p-POE
18
 
19
+ This model is a fine-tuned version of [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf) on the None dataset.
20
  It achieves the following results on the evaluation set:
21
  - Loss: nan
22
 
 
55
 
56
  | Training Loss | Epoch | Step | Validation Loss |
57
  |:-------------:|:-----:|:----:|:---------------:|
58
+ | 0.7327 | 1.0 | 1039 | nan |
59
 
60
 
61
  ### Framework versions
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d370cf8d66f04acedfbae1d4d8d05426bcf20615ba409e1ded0bd718bca76c0f
3
  size 60089544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af8b59a0f339223195ffda22e2cc190e5b99e802340991131a98cce499515eaf
3
  size 60089544
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.7647893634548893,
4
- "train_runtime": 21987.6737,
5
- "train_samples": 21594,
6
- "train_samples_per_second": 0.982,
7
- "train_steps_per_second": 0.031
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.7603742487735766,
4
+ "train_runtime": 32307.7024,
5
+ "train_samples": 33257,
6
+ "train_samples_per_second": 1.029,
7
+ "train_steps_per_second": 0.032
8
  }
runs/May03_06-09-37_ip-172-31-69-60.ec2.internal/events.out.tfevents.1714716718.ip-172-31-69-60.ec2.internal.2066.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:962e7c820913d857a21050000793e4752d88da1e40357a60fd8544edbf849d8a
3
- size 47019
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38dcc78b365fe2e878562c93604823a359f8f1c3842b73d7ec15f47d7c201acf
3
+ size 49121
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "epoch": 1.0,
3
- "train_loss": 0.7647893634548893,
4
- "train_runtime": 21987.6737,
5
- "train_samples": 21594,
6
- "train_samples_per_second": 0.982,
7
- "train_steps_per_second": 0.031
8
  }
 
1
  {
2
  "epoch": 1.0,
3
+ "train_loss": 0.7603742487735766,
4
+ "train_runtime": 32307.7024,
5
+ "train_samples": 33257,
6
+ "train_samples_per_second": 1.029,
7
+ "train_steps_per_second": 0.032
8
  }
trainer_state.json CHANGED
@@ -1,989 +1,1493 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.0,
5
  "eval_steps": 500,
6
- "global_step": 675,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
- "grad_norm": 0.028420851102407323,
14
- "learning_rate": 2.9411764705882355e-06,
15
- "loss": 0.8769,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.01,
20
- "grad_norm": 0.02533437087421799,
21
- "learning_rate": 1.4705882352941177e-05,
22
- "loss": 0.863,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01,
27
- "grad_norm": 0.029598127358419903,
28
- "learning_rate": 2.9411764705882354e-05,
29
- "loss": 0.8899,
30
  "step": 10
31
  },
32
  {
33
- "epoch": 0.02,
34
- "grad_norm": 0.04418645965251518,
35
- "learning_rate": 4.411764705882353e-05,
36
- "loss": 0.8643,
37
  "step": 15
38
  },
39
  {
40
- "epoch": 0.03,
41
- "grad_norm": 0.08391069341850195,
42
- "learning_rate": 5.882352941176471e-05,
43
- "loss": 0.8164,
44
  "step": 20
45
  },
46
  {
47
- "epoch": 0.04,
48
- "grad_norm": 0.0848430702291638,
49
- "learning_rate": 7.352941176470589e-05,
50
- "loss": 0.838,
51
  "step": 25
52
  },
53
  {
54
- "epoch": 0.04,
55
- "grad_norm": 0.07340453634110397,
56
- "learning_rate": 8.823529411764706e-05,
57
- "loss": 0.8371,
58
  "step": 30
59
  },
60
  {
61
- "epoch": 0.05,
62
- "grad_norm": 0.060721401358425686,
63
- "learning_rate": 0.00010294117647058823,
64
- "loss": 0.7967,
65
  "step": 35
66
  },
67
  {
68
- "epoch": 0.06,
69
- "grad_norm": 0.06406699741542068,
70
- "learning_rate": 0.00011764705882352942,
71
- "loss": 0.7952,
72
  "step": 40
73
  },
74
  {
75
- "epoch": 0.07,
76
- "grad_norm": 0.0702367088144725,
77
- "learning_rate": 0.0001323529411764706,
78
- "loss": 0.7785,
79
  "step": 45
80
  },
81
  {
82
- "epoch": 0.07,
83
- "grad_norm": 0.05482720735795007,
84
- "learning_rate": 0.00014705882352941178,
85
- "loss": 0.7855,
86
  "step": 50
87
  },
88
  {
89
- "epoch": 0.08,
90
- "grad_norm": 0.061311996254463264,
91
- "learning_rate": 0.00016176470588235295,
92
- "loss": 0.7836,
93
  "step": 55
94
  },
95
  {
96
- "epoch": 0.09,
97
- "grad_norm": 0.057997545698835314,
98
- "learning_rate": 0.00017647058823529413,
99
- "loss": 0.7932,
100
  "step": 60
101
  },
102
  {
103
- "epoch": 0.1,
104
- "grad_norm": 0.04913048738654215,
105
- "learning_rate": 0.0001911764705882353,
106
- "loss": 0.7746,
107
  "step": 65
108
  },
109
  {
110
- "epoch": 0.1,
111
- "grad_norm": 0.051276736158220364,
112
- "learning_rate": 0.00019999464266898484,
113
- "loss": 0.7464,
114
  "step": 70
115
  },
116
  {
117
- "epoch": 0.11,
118
- "grad_norm": 0.04995736942327917,
119
- "learning_rate": 0.00019993437928712978,
120
- "loss": 0.7248,
121
  "step": 75
122
  },
123
  {
124
- "epoch": 0.12,
125
- "grad_norm": 0.045680340036933116,
126
- "learning_rate": 0.0001998071963486563,
127
- "loss": 0.7855,
128
  "step": 80
129
  },
130
  {
131
- "epoch": 0.13,
132
- "grad_norm": 0.053344671279534676,
133
- "learning_rate": 0.00019961317901970953,
134
- "loss": 0.7396,
135
  "step": 85
136
  },
137
  {
138
- "epoch": 0.13,
139
- "grad_norm": 0.05347208673468481,
140
- "learning_rate": 0.0001993524572210807,
141
- "loss": 0.7623,
142
  "step": 90
143
  },
144
  {
145
- "epoch": 0.14,
146
- "grad_norm": 0.044577349788325,
147
- "learning_rate": 0.00019902520554120772,
148
- "loss": 0.7595,
149
  "step": 95
150
  },
151
  {
152
- "epoch": 0.15,
153
- "grad_norm": 0.04633173399608109,
154
- "learning_rate": 0.00019863164311926433,
155
- "loss": 0.7759,
156
  "step": 100
157
  },
158
  {
159
- "epoch": 0.16,
160
- "grad_norm": 0.044125485710741395,
161
- "learning_rate": 0.00019817203349841738,
162
- "loss": 0.7858,
163
  "step": 105
164
  },
165
  {
166
- "epoch": 0.16,
167
- "grad_norm": 0.041739410620092,
168
- "learning_rate": 0.00019764668444934854,
169
- "loss": 0.7682,
170
  "step": 110
171
  },
172
  {
173
- "epoch": 0.17,
174
- "grad_norm": 0.046548199607491646,
175
- "learning_rate": 0.0001970559477641606,
176
- "loss": 0.7442,
177
  "step": 115
178
  },
179
  {
180
- "epoch": 0.18,
181
- "grad_norm": 0.04287304549804181,
182
- "learning_rate": 0.0001964002190208052,
183
- "loss": 0.7966,
184
  "step": 120
185
  },
186
  {
187
- "epoch": 0.19,
188
- "grad_norm": 0.04461470993270133,
189
- "learning_rate": 0.00019567993731818984,
190
- "loss": 0.7678,
191
  "step": 125
192
  },
193
  {
194
- "epoch": 0.19,
195
- "grad_norm": 0.039705545161659035,
196
- "learning_rate": 0.00019489558498214196,
197
- "loss": 0.7345,
198
  "step": 130
199
  },
200
  {
201
- "epoch": 0.2,
202
- "grad_norm": 0.03330898777260923,
203
- "learning_rate": 0.00019404768724242666,
204
- "loss": 0.7714,
205
  "step": 135
206
  },
207
  {
208
- "epoch": 0.21,
209
- "grad_norm": 0.046594949440474036,
210
- "learning_rate": 0.00019313681188103457,
211
- "loss": 0.7757,
212
  "step": 140
213
  },
214
  {
215
- "epoch": 0.21,
216
- "grad_norm": 0.04928885679685318,
217
- "learning_rate": 0.000192163568851975,
218
- "loss": 0.8217,
219
  "step": 145
220
  },
221
  {
222
- "epoch": 0.22,
223
- "grad_norm": 0.04466472170425771,
224
- "learning_rate": 0.00019112860987282958,
225
- "loss": 0.7356,
226
  "step": 150
227
  },
228
  {
229
- "epoch": 0.23,
230
- "grad_norm": 0.05078422739553919,
231
- "learning_rate": 0.0001900326279883392,
232
- "loss": 0.7262,
233
  "step": 155
234
  },
235
  {
236
- "epoch": 0.24,
237
- "grad_norm": 0.042393502583193486,
238
- "learning_rate": 0.00018887635710631716,
239
- "loss": 0.791,
240
  "step": 160
241
  },
242
  {
243
- "epoch": 0.24,
244
- "grad_norm": 0.04096339138017866,
245
- "learning_rate": 0.00018766057150619865,
246
- "loss": 0.7621,
247
  "step": 165
248
  },
249
  {
250
- "epoch": 0.25,
251
- "grad_norm": 0.04894911531750606,
252
- "learning_rate": 0.00018638608532055634,
253
- "loss": 0.714,
254
  "step": 170
255
  },
256
  {
257
- "epoch": 0.26,
258
- "grad_norm": 0.04424627496496155,
259
- "learning_rate": 0.00018505375198992857,
260
- "loss": 0.7445,
261
  "step": 175
262
  },
263
  {
264
- "epoch": 0.27,
265
- "grad_norm": 0.05064439962306937,
266
- "learning_rate": 0.00018366446369132578,
267
- "loss": 0.7502,
268
  "step": 180
269
  },
270
  {
271
- "epoch": 0.27,
272
- "grad_norm": 0.05185726609274544,
273
- "learning_rate": 0.00018221915074079762,
274
- "loss": 0.7423,
275
  "step": 185
276
  },
277
  {
278
- "epoch": 0.28,
279
- "grad_norm": 0.049634018260632524,
280
- "learning_rate": 0.00018071878097046065,
281
- "loss": 0.7853,
282
  "step": 190
283
  },
284
  {
285
- "epoch": 0.29,
286
- "grad_norm": 0.03718521878894617,
287
- "learning_rate": 0.00017916435908040413,
288
- "loss": 0.7575,
289
  "step": 195
290
  },
291
  {
292
- "epoch": 0.3,
293
- "grad_norm": 0.054866103106943676,
294
- "learning_rate": 0.00017755692596590778,
295
- "loss": 0.7604,
296
  "step": 200
297
  },
298
  {
299
- "epoch": 0.3,
300
- "grad_norm": 0.040017034968621745,
301
- "learning_rate": 0.00017589755802042186,
302
- "loss": 0.7818,
303
  "step": 205
304
  },
305
  {
306
- "epoch": 0.31,
307
- "grad_norm": 0.03964997679073274,
308
- "learning_rate": 0.00017418736641477636,
309
- "loss": 0.7464,
310
  "step": 210
311
  },
312
  {
313
- "epoch": 0.32,
314
- "grad_norm": 0.051157610923925706,
315
- "learning_rate": 0.0001724274963531022,
316
- "loss": 0.7534,
317
  "step": 215
318
  },
319
  {
320
- "epoch": 0.33,
321
- "grad_norm": 0.04692776206415383,
322
- "learning_rate": 0.00017061912630596252,
323
- "loss": 0.7862,
324
  "step": 220
325
  },
326
  {
327
- "epoch": 0.33,
328
- "grad_norm": 0.04009778793971981,
329
- "learning_rate": 0.00016876346722120747,
330
- "loss": 0.7619,
331
  "step": 225
332
  },
333
  {
334
- "epoch": 0.34,
335
- "grad_norm": 0.037858593687305236,
336
- "learning_rate": 0.00016686176171308126,
337
- "loss": 0.7822,
338
  "step": 230
339
  },
340
  {
341
- "epoch": 0.35,
342
- "grad_norm": 0.03514517489146636,
343
- "learning_rate": 0.0001649152832301241,
344
- "loss": 0.7475,
345
  "step": 235
346
  },
347
  {
348
- "epoch": 0.36,
349
- "grad_norm": 0.043964485334365984,
350
- "learning_rate": 0.00016292533520242662,
351
- "loss": 0.775,
352
  "step": 240
353
  },
354
  {
355
- "epoch": 0.36,
356
- "grad_norm": 0.06121852032774167,
357
- "learning_rate": 0.00016089325016880736,
358
- "loss": 0.7501,
359
  "step": 245
360
  },
361
  {
362
- "epoch": 0.37,
363
- "grad_norm": 0.050365919886299945,
364
- "learning_rate": 0.0001588203888844982,
365
- "loss": 0.7498,
366
  "step": 250
367
  },
368
  {
369
- "epoch": 0.38,
370
- "grad_norm": 0.04601818654614394,
371
- "learning_rate": 0.00015670813940993502,
372
- "loss": 0.7942,
373
  "step": 255
374
  },
375
  {
376
- "epoch": 0.39,
377
- "grad_norm": 0.049451503331748733,
378
- "learning_rate": 0.00015455791618126404,
379
- "loss": 0.7232,
380
  "step": 260
381
  },
382
  {
383
- "epoch": 0.39,
384
- "grad_norm": 0.049704756401709786,
385
- "learning_rate": 0.00015237115906318563,
386
- "loss": 0.7474,
387
  "step": 265
388
  },
389
  {
390
- "epoch": 0.4,
391
- "grad_norm": 0.043536870823896914,
392
- "learning_rate": 0.0001501493323847707,
393
- "loss": 0.7074,
394
  "step": 270
395
  },
396
  {
397
- "epoch": 0.41,
398
- "grad_norm": 0.052592436248192806,
399
- "learning_rate": 0.00014789392395889468,
400
- "loss": 0.7675,
401
  "step": 275
402
  },
403
  {
404
- "epoch": 0.41,
405
- "grad_norm": 0.04641887287230184,
406
- "learning_rate": 0.00014560644408594602,
407
- "loss": 0.7884,
408
  "step": 280
409
  },
410
  {
411
- "epoch": 0.42,
412
- "grad_norm": 0.03778851947891411,
413
- "learning_rate": 0.0001432884245424761,
414
- "loss": 0.7364,
415
  "step": 285
416
  },
417
  {
418
- "epoch": 0.43,
419
- "grad_norm": 0.04383653972641628,
420
- "learning_rate": 0.00014094141755546815,
421
- "loss": 0.7633,
422
  "step": 290
423
  },
424
  {
425
- "epoch": 0.44,
426
- "grad_norm": 0.04958511831355967,
427
- "learning_rate": 0.00013856699476291176,
428
- "loss": 0.7254,
429
  "step": 295
430
  },
431
  {
432
- "epoch": 0.44,
433
- "grad_norm": 0.047545677145208354,
434
- "learning_rate": 0.000136166746161379,
435
- "loss": 0.7389,
436
  "step": 300
437
  },
438
  {
439
- "epoch": 0.45,
440
- "grad_norm": 0.049601892362158714,
441
- "learning_rate": 0.00013374227904130724,
442
- "loss": 0.7298,
443
  "step": 305
444
  },
445
  {
446
- "epoch": 0.46,
447
- "grad_norm": 0.039019104385466755,
448
- "learning_rate": 0.00013129521691070107,
449
- "loss": 0.7372,
450
  "step": 310
451
  },
452
  {
453
- "epoch": 0.47,
454
- "grad_norm": 0.04324309132836927,
455
- "learning_rate": 0.00012882719840797473,
456
- "loss": 0.7586,
457
  "step": 315
458
  },
459
  {
460
- "epoch": 0.47,
461
- "grad_norm": 0.04511381039375704,
462
- "learning_rate": 0.0001263398762046623,
463
- "loss": 0.782,
464
  "step": 320
465
  },
466
  {
467
- "epoch": 0.48,
468
- "grad_norm": 0.037065274891104005,
469
- "learning_rate": 0.00012383491589873123,
470
- "loss": 0.73,
471
  "step": 325
472
  },
473
  {
474
- "epoch": 0.49,
475
- "grad_norm": 0.04379387246767109,
476
- "learning_rate": 0.0001213139948992394,
477
- "loss": 0.7602,
478
  "step": 330
479
  },
480
  {
481
- "epoch": 0.5,
482
- "grad_norm": 0.04868933738312991,
483
- "learning_rate": 0.0001187788013030837,
484
- "loss": 0.7467,
485
  "step": 335
486
  },
487
  {
488
- "epoch": 0.5,
489
- "grad_norm": 0.04841745199938846,
490
- "learning_rate": 0.00011623103276459086,
491
- "loss": 0.7862,
492
  "step": 340
493
  },
494
  {
495
- "epoch": 0.51,
496
- "grad_norm": 0.046825753709491394,
497
- "learning_rate": 0.00011367239535870913,
498
- "loss": 0.7523,
499
  "step": 345
500
  },
501
  {
502
- "epoch": 0.52,
503
- "grad_norm": 0.05204047537850423,
504
- "learning_rate": 0.00011110460243856052,
505
- "loss": 0.721,
506
  "step": 350
507
  },
508
  {
509
- "epoch": 0.53,
510
- "grad_norm": 0.04835371095843328,
511
- "learning_rate": 0.0001085293734881197,
512
- "loss": 0.8165,
513
  "step": 355
514
  },
515
  {
516
- "epoch": 0.53,
517
- "grad_norm": 0.046503091391954944,
518
- "learning_rate": 0.00010594843297078737,
519
- "loss": 0.7151,
520
  "step": 360
521
  },
522
  {
523
- "epoch": 0.54,
524
- "grad_norm": 0.05401067419624875,
525
- "learning_rate": 0.00010336350917462925,
526
- "loss": 0.7623,
527
  "step": 365
528
  },
529
  {
530
- "epoch": 0.55,
531
- "grad_norm": 0.046238914587313926,
532
- "learning_rate": 0.00010077633305505403,
533
- "loss": 0.7952,
534
  "step": 370
535
  },
536
  {
537
- "epoch": 0.56,
538
- "grad_norm": 0.04067724976292184,
539
- "learning_rate": 9.818863707570475e-05,
540
- "loss": 0.7509,
541
  "step": 375
542
  },
543
  {
544
- "epoch": 0.56,
545
- "grad_norm": 0.041537242387637924,
546
- "learning_rate": 9.560215404834095e-05,
547
- "loss": 0.7121,
548
  "step": 380
549
  },
550
  {
551
- "epoch": 0.57,
552
- "grad_norm": 0.042556692843415594,
553
- "learning_rate": 9.30186159724869e-05,
554
- "loss": 0.7708,
555
  "step": 385
556
  },
557
  {
558
- "epoch": 0.58,
559
- "grad_norm": 0.04846970178587132,
560
- "learning_rate": 9.043975287562441e-05,
561
- "loss": 0.7258,
562
  "step": 390
563
  },
564
  {
565
- "epoch": 0.59,
566
- "grad_norm": 0.062274515472467,
567
- "learning_rate": 8.786729165470584e-05,
568
- "loss": 0.7698,
569
  "step": 395
570
  },
571
  {
572
- "epoch": 0.59,
573
- "grad_norm": 0.05082087012341951,
574
- "learning_rate": 8.530295491976337e-05,
575
- "loss": 0.7717,
576
  "step": 400
577
  },
578
  {
579
- "epoch": 0.6,
580
- "grad_norm": 0.04831882202604005,
581
- "learning_rate": 8.274845984038916e-05,
582
- "loss": 0.7679,
583
  "step": 405
584
  },
585
  {
586
- "epoch": 0.61,
587
- "grad_norm": 0.051680028901517745,
588
- "learning_rate": 8.020551699585842e-05,
589
- "loss": 0.7265,
590
  "step": 410
591
  },
592
  {
593
- "epoch": 0.61,
594
- "grad_norm": 0.051175873784970766,
595
- "learning_rate": 7.76758292296659e-05,
596
- "loss": 0.7696,
597
  "step": 415
598
  },
599
  {
600
- "epoch": 0.62,
601
- "grad_norm": 0.06738625933727418,
602
- "learning_rate": 7.516109050924201e-05,
603
- "loss": 0.7781,
604
  "step": 420
605
  },
606
  {
607
- "epoch": 0.63,
608
- "grad_norm": 0.05117489109997125,
609
- "learning_rate": 7.266298479161318e-05,
610
- "loss": 0.771,
611
  "step": 425
612
  },
613
  {
614
- "epoch": 0.64,
615
- "grad_norm": 0.04211785915218291,
616
- "learning_rate": 7.01831848957653e-05,
617
- "loss": 0.7368,
618
  "step": 430
619
  },
620
  {
621
- "epoch": 0.64,
622
- "grad_norm": 0.049459639807936356,
623
- "learning_rate": 6.772335138246548e-05,
624
- "loss": 0.7234,
625
  "step": 435
626
  },
627
  {
628
- "epoch": 0.65,
629
- "grad_norm": 0.05981206570373218,
630
- "learning_rate": 6.528513144229255e-05,
631
- "loss": 0.7063,
632
  "step": 440
633
  },
634
  {
635
- "epoch": 0.66,
636
- "grad_norm": 0.042858944283255956,
637
- "learning_rate": 6.287015779262064e-05,
638
- "loss": 0.7178,
639
  "step": 445
640
  },
641
  {
642
- "epoch": 0.67,
643
- "grad_norm": 0.051605890455677136,
644
- "learning_rate": 6.048004758429451e-05,
645
- "loss": 0.8009,
646
  "step": 450
647
  },
648
  {
649
- "epoch": 0.67,
650
- "grad_norm": 0.04839278716779415,
651
- "learning_rate": 5.8116401318728667e-05,
652
- "loss": 0.7969,
653
  "step": 455
654
  },
655
  {
656
- "epoch": 0.68,
657
- "grad_norm": 0.053679686497026036,
658
- "learning_rate": 5.578080177615575e-05,
659
- "loss": 0.7744,
660
  "step": 460
661
  },
662
  {
663
- "epoch": 0.69,
664
- "grad_norm": 0.04886589382975143,
665
- "learning_rate": 5.3474812955741404e-05,
666
- "loss": 0.782,
667
  "step": 465
668
  },
669
  {
670
- "epoch": 0.7,
671
- "grad_norm": 0.05872104081042249,
672
- "learning_rate": 5.119997902827584e-05,
673
- "loss": 0.7684,
674
  "step": 470
675
  },
676
  {
677
- "epoch": 0.7,
678
- "grad_norm": 0.04971188432483476,
679
- "learning_rate": 4.895782330214291e-05,
680
- "loss": 0.8219,
681
  "step": 475
682
  },
683
  {
684
- "epoch": 0.71,
685
- "grad_norm": 0.05137448628344103,
686
- "learning_rate": 4.674984720325961e-05,
687
- "loss": 0.7654,
688
  "step": 480
689
  },
690
  {
691
- "epoch": 0.72,
692
- "grad_norm": 0.04686654449943715,
693
- "learning_rate": 4.4577529269668874e-05,
694
- "loss": 0.7774,
695
  "step": 485
696
  },
697
  {
698
- "epoch": 0.73,
699
- "grad_norm": 0.055295260668893974,
700
- "learning_rate": 4.244232416145839e-05,
701
- "loss": 0.7245,
702
  "step": 490
703
  },
704
  {
705
- "epoch": 0.73,
706
- "grad_norm": 0.05389337215866635,
707
- "learning_rate": 4.0345661686669745e-05,
708
- "loss": 0.8061,
709
  "step": 495
710
  },
711
  {
712
- "epoch": 0.74,
713
- "grad_norm": 0.05870235717745641,
714
- "learning_rate": 3.828894584384867e-05,
715
- "loss": 0.8031,
716
  "step": 500
717
  },
718
  {
719
- "epoch": 0.75,
720
- "grad_norm": 0.05571889022670846,
721
- "learning_rate": 3.62735538818787e-05,
722
- "loss": 0.7614,
723
  "step": 505
724
  },
725
  {
726
- "epoch": 0.76,
727
- "grad_norm": 0.04492834518434723,
728
- "learning_rate": 3.43008353777269e-05,
729
- "loss": 0.7331,
730
  "step": 510
731
  },
732
  {
733
- "epoch": 0.76,
734
- "grad_norm": 0.05343462218042988,
735
- "learning_rate": 3.237211133272004e-05,
736
- "loss": 0.7355,
737
  "step": 515
738
  },
739
  {
740
- "epoch": 0.77,
741
- "grad_norm": 0.047988801277496954,
742
- "learning_rate": 3.0488673287955882e-05,
743
- "loss": 0.7237,
744
  "step": 520
745
  },
746
  {
747
- "epoch": 0.78,
748
- "grad_norm": 0.055886245680751935,
749
- "learning_rate": 2.8651782459442176e-05,
750
- "loss": 0.7426,
751
  "step": 525
752
  },
753
  {
754
- "epoch": 0.79,
755
- "grad_norm": 0.04696112028105608,
756
- "learning_rate": 2.686266889354211e-05,
757
- "loss": 0.7487,
758
  "step": 530
759
  },
760
  {
761
- "epoch": 0.79,
762
- "grad_norm": 0.04555764819319834,
763
- "learning_rate": 2.5122530643292275e-05,
764
- "loss": 0.7344,
765
  "step": 535
766
  },
767
  {
768
- "epoch": 0.8,
769
- "grad_norm": 0.05199303289710418,
770
- "learning_rate": 2.3432532966144527e-05,
771
- "loss": 0.7604,
772
  "step": 540
773
  },
774
  {
775
- "epoch": 0.81,
776
- "grad_norm": 0.05146492861699787,
777
- "learning_rate": 2.1793807543668853e-05,
778
- "loss": 0.7383,
779
  "step": 545
780
  },
781
  {
782
- "epoch": 0.81,
783
- "grad_norm": 0.05548374949115557,
784
- "learning_rate": 2.0207451723739633e-05,
785
- "loss": 0.7565,
786
  "step": 550
787
  },
788
  {
789
- "epoch": 0.82,
790
- "grad_norm": 0.04718878121287069,
791
- "learning_rate": 1.8674527785713247e-05,
792
- "loss": 0.7889,
793
  "step": 555
794
  },
795
  {
796
- "epoch": 0.83,
797
- "grad_norm": 0.07223231382050395,
798
- "learning_rate": 1.7196062229088604e-05,
799
- "loss": 0.7734,
800
  "step": 560
801
  },
802
  {
803
- "epoch": 0.84,
804
- "grad_norm": 0.051918878796507154,
805
- "learning_rate": 1.577304508612717e-05,
806
- "loss": 0.7697,
807
  "step": 565
808
  },
809
  {
810
- "epoch": 0.84,
811
- "grad_norm": 0.05143371773283759,
812
- "learning_rate": 1.4406429258892762e-05,
813
- "loss": 0.7591,
814
  "step": 570
815
  },
816
  {
817
- "epoch": 0.85,
818
- "grad_norm": 0.06242991163796485,
819
- "learning_rate": 1.3097129881154934e-05,
820
- "loss": 0.7888,
821
  "step": 575
822
  },
823
  {
824
- "epoch": 0.86,
825
- "grad_norm": 0.05577486269105434,
826
- "learning_rate": 1.1846023705583442e-05,
827
- "loss": 0.7503,
828
  "step": 580
829
  },
830
  {
831
- "epoch": 0.87,
832
- "grad_norm": 0.05386359623343792,
833
- "learning_rate": 1.065394851664394e-05,
834
- "loss": 0.7306,
835
  "step": 585
836
  },
837
  {
838
- "epoch": 0.87,
839
- "grad_norm": 0.06740139242925512,
840
- "learning_rate": 9.521702569588198e-06,
841
- "loss": 0.7748,
842
  "step": 590
843
  },
844
  {
845
- "epoch": 0.88,
846
- "grad_norm": 0.05952304608258396,
847
- "learning_rate": 8.450044055914497e-06,
848
- "loss": 0.6941,
849
  "step": 595
850
  },
851
  {
852
- "epoch": 0.89,
853
- "grad_norm": 0.06052761239891292,
854
- "learning_rate": 7.439690595656013e-06,
855
- "loss": 0.7775,
856
  "step": 600
857
  },
858
  {
859
- "epoch": 0.9,
860
- "grad_norm": 0.05646485799009386,
861
- "learning_rate": 6.4913187568374164e-06,
862
- "loss": 0.7941,
863
  "step": 605
864
  },
865
  {
866
- "epoch": 0.9,
867
- "grad_norm": 0.051627777900137006,
868
- "learning_rate": 5.605563602421149e-06,
869
- "loss": 0.7621,
870
  "step": 610
871
  },
872
  {
873
- "epoch": 0.91,
874
- "grad_norm": 0.05672370227304409,
875
- "learning_rate": 4.783018265047179e-06,
876
- "loss": 0.7598,
877
  "step": 615
878
  },
879
  {
880
- "epoch": 0.92,
881
- "grad_norm": 0.04806888785096822,
882
- "learning_rate": 4.024233549850509e-06,
883
- "loss": 0.7585,
884
  "step": 620
885
  },
886
  {
887
- "epoch": 0.93,
888
- "grad_norm": 0.054256686487143276,
889
- "learning_rate": 3.329717565622825e-06,
890
- "loss": 0.7766,
891
  "step": 625
892
  },
893
  {
894
- "epoch": 0.93,
895
- "grad_norm": 0.04081148671596208,
896
- "learning_rate": 2.699935384565111e-06,
897
- "loss": 0.7324,
898
  "step": 630
899
  },
900
  {
901
- "epoch": 0.94,
902
- "grad_norm": 0.052421185625722275,
903
- "learning_rate": 2.1353087308590314e-06,
904
- "loss": 0.7933,
905
  "step": 635
906
  },
907
  {
908
- "epoch": 0.95,
909
- "grad_norm": 0.05392813215656955,
910
- "learning_rate": 1.6362156982656084e-06,
911
- "loss": 0.7896,
912
  "step": 640
913
  },
914
  {
915
- "epoch": 0.96,
916
- "grad_norm": 0.052109378844003566,
917
- "learning_rate": 1.2029904969404482e-06,
918
- "loss": 0.7633,
919
  "step": 645
920
  },
921
  {
922
- "epoch": 0.96,
923
- "grad_norm": 0.05562712893622207,
924
- "learning_rate": 8.359232296349162e-07,
925
- "loss": 0.7664,
926
  "step": 650
927
  },
928
  {
929
- "epoch": 0.97,
930
- "grad_norm": 0.05328976304172556,
931
- "learning_rate": 5.352596974332436e-07,
932
- "loss": 0.7658,
933
  "step": 655
934
  },
935
  {
936
- "epoch": 0.98,
937
- "grad_norm": 0.05351883682294467,
938
- "learning_rate": 3.0120123515540164e-07,
939
- "loss": 0.7871,
940
  "step": 660
941
  },
942
  {
943
- "epoch": 0.99,
944
- "grad_norm": 0.05900811011725127,
945
- "learning_rate": 1.3390457653639222e-07,
946
- "loss": 0.7749,
947
  "step": 665
948
  },
949
  {
950
- "epoch": 0.99,
951
- "grad_norm": 0.06175235611664889,
952
- "learning_rate": 3.3481749271768726e-08,
953
- "loss": 0.7353,
954
  "step": 670
955
  },
956
  {
957
- "epoch": 1.0,
958
- "grad_norm": 0.05208882374001457,
959
- "learning_rate": 0.0,
960
- "loss": 0.7591,
961
  "step": 675
962
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
963
  {
964
  "epoch": 1.0,
965
  "eval_loss": NaN,
966
- "eval_runtime": 2998.9455,
967
- "eval_samples_per_second": 0.77,
968
- "eval_steps_per_second": 0.193,
969
- "step": 675
970
  },
971
  {
972
  "epoch": 1.0,
973
- "step": 675,
974
- "total_flos": 2.235287773328179e+16,
975
- "train_loss": 0.7647893634548893,
976
- "train_runtime": 21987.6737,
977
- "train_samples_per_second": 0.982,
978
- "train_steps_per_second": 0.031
979
  }
980
  ],
981
  "logging_steps": 5,
982
- "max_steps": 675,
983
  "num_input_tokens_seen": 0,
984
  "num_train_epochs": 1,
985
  "save_steps": 100,
986
- "total_flos": 2.235287773328179e+16,
987
  "train_batch_size": 4,
988
  "trial_name": null,
989
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9995189995189995,
5
  "eval_steps": 500,
6
+ "global_step": 1039,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
  "epoch": 0.0,
13
+ "grad_norm": 0.026559076829896673,
14
+ "learning_rate": 1.9230769230769234e-06,
15
+ "loss": 0.8553,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.0,
20
+ "grad_norm": 0.024109469955629563,
21
+ "learning_rate": 9.615384615384616e-06,
22
+ "loss": 0.8949,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.01,
27
+ "grad_norm": 0.035843143099576466,
28
+ "learning_rate": 1.923076923076923e-05,
29
+ "loss": 0.8487,
30
  "step": 10
31
  },
32
  {
33
+ "epoch": 0.01,
34
+ "grad_norm": 0.02927511973728809,
35
+ "learning_rate": 2.8846153846153845e-05,
36
+ "loss": 0.8298,
37
  "step": 15
38
  },
39
  {
40
+ "epoch": 0.02,
41
+ "grad_norm": 0.0482470347406101,
42
+ "learning_rate": 3.846153846153846e-05,
43
+ "loss": 0.8369,
44
  "step": 20
45
  },
46
  {
47
+ "epoch": 0.02,
48
+ "grad_norm": 0.04787111054382548,
49
+ "learning_rate": 4.8076923076923084e-05,
50
+ "loss": 0.8625,
51
  "step": 25
52
  },
53
  {
54
+ "epoch": 0.03,
55
+ "grad_norm": 0.09068040679248557,
56
+ "learning_rate": 5.769230769230769e-05,
57
+ "loss": 0.8133,
58
  "step": 30
59
  },
60
  {
61
+ "epoch": 0.03,
62
+ "grad_norm": 0.07412802798136442,
63
+ "learning_rate": 6.730769230769232e-05,
64
+ "loss": 0.8543,
65
  "step": 35
66
  },
67
  {
68
+ "epoch": 0.04,
69
+ "grad_norm": 0.0699997088327299,
70
+ "learning_rate": 7.692307692307693e-05,
71
+ "loss": 0.763,
72
  "step": 40
73
  },
74
  {
75
+ "epoch": 0.04,
76
+ "grad_norm": 0.05170267487139005,
77
+ "learning_rate": 8.653846153846155e-05,
78
+ "loss": 0.7827,
79
  "step": 45
80
  },
81
  {
82
+ "epoch": 0.05,
83
+ "grad_norm": 0.07357073672675946,
84
+ "learning_rate": 9.615384615384617e-05,
85
+ "loss": 0.8022,
86
  "step": 50
87
  },
88
  {
89
+ "epoch": 0.05,
90
+ "grad_norm": 0.05641137867224088,
91
+ "learning_rate": 0.00010576923076923077,
92
+ "loss": 0.7397,
93
  "step": 55
94
  },
95
  {
96
+ "epoch": 0.06,
97
+ "grad_norm": 0.057002882272420445,
98
+ "learning_rate": 0.00011538461538461538,
99
+ "loss": 0.777,
100
  "step": 60
101
  },
102
  {
103
+ "epoch": 0.06,
104
+ "grad_norm": 0.050844397685080686,
105
+ "learning_rate": 0.000125,
106
+ "loss": 0.7664,
107
  "step": 65
108
  },
109
  {
110
+ "epoch": 0.07,
111
+ "grad_norm": 0.05336016241159068,
112
+ "learning_rate": 0.00013461538461538464,
113
+ "loss": 0.7956,
114
  "step": 70
115
  },
116
  {
117
+ "epoch": 0.07,
118
+ "grad_norm": 0.05359842093910465,
119
+ "learning_rate": 0.00014423076923076924,
120
+ "loss": 0.751,
121
  "step": 75
122
  },
123
  {
124
+ "epoch": 0.08,
125
+ "grad_norm": 0.043971570409194735,
126
+ "learning_rate": 0.00015384615384615385,
127
+ "loss": 0.7823,
128
  "step": 80
129
  },
130
  {
131
+ "epoch": 0.08,
132
+ "grad_norm": 0.044563802592992065,
133
+ "learning_rate": 0.00016346153846153846,
134
+ "loss": 0.7908,
135
  "step": 85
136
  },
137
  {
138
+ "epoch": 0.09,
139
+ "grad_norm": 0.04801183812641932,
140
+ "learning_rate": 0.0001730769230769231,
141
+ "loss": 0.827,
142
  "step": 90
143
  },
144
  {
145
+ "epoch": 0.09,
146
+ "grad_norm": 0.0522380719803802,
147
+ "learning_rate": 0.0001826923076923077,
148
+ "loss": 0.8225,
149
  "step": 95
150
  },
151
  {
152
+ "epoch": 0.1,
153
+ "grad_norm": 0.048043288977650755,
154
+ "learning_rate": 0.00019230769230769233,
155
+ "loss": 0.7715,
156
  "step": 100
157
  },
158
  {
159
+ "epoch": 0.1,
160
+ "grad_norm": 0.04594631744893944,
161
+ "learning_rate": 0.00019999943552317104,
162
+ "loss": 0.7789,
163
  "step": 105
164
  },
165
  {
166
+ "epoch": 0.11,
167
+ "grad_norm": 0.050805778293244806,
168
+ "learning_rate": 0.00019997967950328128,
169
+ "loss": 0.8401,
170
  "step": 110
171
  },
172
  {
173
+ "epoch": 0.11,
174
+ "grad_norm": 0.04435482092479941,
175
+ "learning_rate": 0.0001999317060143023,
176
+ "loss": 0.7773,
177
  "step": 115
178
  },
179
  {
180
+ "epoch": 0.12,
181
+ "grad_norm": 0.05022795262704976,
182
+ "learning_rate": 0.0001998555285958899,
183
+ "loss": 0.7271,
184
  "step": 120
185
  },
186
  {
187
+ "epoch": 0.12,
188
+ "grad_norm": 0.05316130307831719,
189
+ "learning_rate": 0.00019975116874775242,
190
+ "loss": 0.8088,
191
  "step": 125
192
  },
193
  {
194
+ "epoch": 0.13,
195
+ "grad_norm": 0.04630310630357938,
196
+ "learning_rate": 0.00019961865592358288,
197
+ "loss": 0.7752,
198
  "step": 130
199
  },
200
  {
201
+ "epoch": 0.13,
202
+ "grad_norm": 0.05955477676937173,
203
+ "learning_rate": 0.0001994580275227462,
204
+ "loss": 0.7639,
205
  "step": 135
206
  },
207
  {
208
+ "epoch": 0.13,
209
+ "grad_norm": 0.04911523657827075,
210
+ "learning_rate": 0.00019926932887972393,
211
+ "loss": 0.7476,
212
  "step": 140
213
  },
214
  {
215
+ "epoch": 0.14,
216
+ "grad_norm": 0.04365854699386244,
217
+ "learning_rate": 0.0001990526132513194,
218
+ "loss": 0.7671,
219
  "step": 145
220
  },
221
  {
222
+ "epoch": 0.14,
223
+ "grad_norm": 0.0479590041051757,
224
+ "learning_rate": 0.00019880794180162693,
225
+ "loss": 0.8015,
226
  "step": 150
227
  },
228
  {
229
+ "epoch": 0.15,
230
+ "grad_norm": 0.053032592897760425,
231
+ "learning_rate": 0.00019853538358476932,
232
+ "loss": 0.8114,
233
  "step": 155
234
  },
235
  {
236
+ "epoch": 0.15,
237
+ "grad_norm": 0.04823282437342225,
238
+ "learning_rate": 0.00019823501552540865,
239
+ "loss": 0.7843,
240
  "step": 160
241
  },
242
  {
243
+ "epoch": 0.16,
244
+ "grad_norm": 0.13942036529350957,
245
+ "learning_rate": 0.00019790692239703557,
246
+ "loss": 0.7066,
247
  "step": 165
248
  },
249
  {
250
+ "epoch": 0.16,
251
+ "grad_norm": 0.05813669118246924,
252
+ "learning_rate": 0.00019755119679804367,
253
+ "loss": 0.7782,
254
  "step": 170
255
  },
256
  {
257
+ "epoch": 0.17,
258
+ "grad_norm": 0.045133845682998344,
259
+ "learning_rate": 0.00019716793912559507,
260
+ "loss": 0.8228,
261
  "step": 175
262
  },
263
  {
264
+ "epoch": 0.17,
265
+ "grad_norm": 0.04767176858103638,
266
+ "learning_rate": 0.00019675725754728527,
267
+ "loss": 0.8016,
268
  "step": 180
269
  },
270
  {
271
+ "epoch": 0.18,
272
+ "grad_norm": 0.05800072730359953,
273
+ "learning_rate": 0.00019631926797061456,
274
+ "loss": 0.7576,
275
  "step": 185
276
  },
277
  {
278
+ "epoch": 0.18,
279
+ "grad_norm": 0.044752361745923355,
280
+ "learning_rate": 0.00019585409401027556,
281
+ "loss": 0.7311,
282
  "step": 190
283
  },
284
  {
285
+ "epoch": 0.19,
286
+ "grad_norm": 0.0436163176892526,
287
+ "learning_rate": 0.00019536186695326486,
288
+ "loss": 0.7584,
289
  "step": 195
290
  },
291
  {
292
+ "epoch": 0.19,
293
+ "grad_norm": 0.07116796220713574,
294
+ "learning_rate": 0.00019484272572182986,
295
+ "loss": 0.7525,
296
  "step": 200
297
  },
298
  {
299
+ "epoch": 0.2,
300
+ "grad_norm": 0.051057018027582515,
301
+ "learning_rate": 0.00019429681683426022,
302
+ "loss": 0.798,
303
  "step": 205
304
  },
305
  {
306
+ "epoch": 0.2,
307
+ "grad_norm": 0.06621856781536847,
308
+ "learning_rate": 0.00019372429436353606,
309
+ "loss": 0.7242,
310
  "step": 210
311
  },
312
  {
313
+ "epoch": 0.21,
314
+ "grad_norm": 0.05280223873501623,
315
+ "learning_rate": 0.0001931253198938432,
316
+ "loss": 0.8013,
317
  "step": 215
318
  },
319
  {
320
+ "epoch": 0.21,
321
+ "grad_norm": 0.049913511617762696,
322
+ "learning_rate": 0.00019250006247496928,
323
+ "loss": 0.7282,
324
  "step": 220
325
  },
326
  {
327
+ "epoch": 0.22,
328
+ "grad_norm": 0.057224419119031235,
329
+ "learning_rate": 0.00019184869857459232,
330
+ "loss": 0.7986,
331
  "step": 225
332
  },
333
  {
334
+ "epoch": 0.22,
335
+ "grad_norm": 0.062107090513119266,
336
+ "learning_rate": 0.00019117141202847586,
337
+ "loss": 0.7305,
338
  "step": 230
339
  },
340
  {
341
+ "epoch": 0.23,
342
+ "grad_norm": 0.06236876490128247,
343
+ "learning_rate": 0.00019046839398858474,
344
+ "loss": 0.7961,
345
  "step": 235
346
  },
347
  {
348
+ "epoch": 0.23,
349
+ "grad_norm": 0.04936400763486868,
350
+ "learning_rate": 0.00018973984286913584,
351
+ "loss": 0.735,
352
  "step": 240
353
  },
354
  {
355
+ "epoch": 0.24,
356
+ "grad_norm": 0.053408247386716914,
357
+ "learning_rate": 0.0001889859642905992,
358
+ "loss": 0.7857,
359
  "step": 245
360
  },
361
  {
362
+ "epoch": 0.24,
363
+ "grad_norm": 0.05781811933339179,
364
+ "learning_rate": 0.00018820697102166526,
365
+ "loss": 0.7627,
366
  "step": 250
367
  },
368
  {
369
+ "epoch": 0.25,
370
+ "grad_norm": 0.05674627874967702,
371
+ "learning_rate": 0.00018740308291919497,
372
+ "loss": 0.7492,
373
  "step": 255
374
  },
375
  {
376
+ "epoch": 0.25,
377
+ "grad_norm": 0.05113375946042358,
378
+ "learning_rate": 0.0001865745268661689,
379
+ "loss": 0.8117,
380
  "step": 260
381
  },
382
  {
383
+ "epoch": 0.25,
384
+ "grad_norm": 0.06186000436202041,
385
+ "learning_rate": 0.00018572153670765365,
386
+ "loss": 0.801,
387
  "step": 265
388
  },
389
  {
390
+ "epoch": 0.26,
391
+ "grad_norm": 0.07562426320620609,
392
+ "learning_rate": 0.00018484435318480332,
393
+ "loss": 0.8071,
394
  "step": 270
395
  },
396
  {
397
+ "epoch": 0.26,
398
+ "grad_norm": 0.06311677584919179,
399
+ "learning_rate": 0.0001839432238669147,
400
+ "loss": 0.7843,
401
  "step": 275
402
  },
403
  {
404
+ "epoch": 0.27,
405
+ "grad_norm": 0.06630091865014885,
406
+ "learning_rate": 0.00018301840308155507,
407
+ "loss": 0.7493,
408
  "step": 280
409
  },
410
  {
411
+ "epoch": 0.27,
412
+ "grad_norm": 0.06544583135680962,
413
+ "learning_rate": 0.00018207015184278305,
414
+ "loss": 0.782,
415
  "step": 285
416
  },
417
  {
418
+ "epoch": 0.28,
419
+ "grad_norm": 0.06541978841910906,
420
+ "learning_rate": 0.000181098737777482,
421
+ "loss": 0.766,
422
  "step": 290
423
  },
424
  {
425
+ "epoch": 0.28,
426
+ "grad_norm": 0.05791188755333043,
427
+ "learning_rate": 0.00018010443504982694,
428
+ "loss": 0.7499,
429
  "step": 295
430
  },
431
  {
432
+ "epoch": 0.29,
433
+ "grad_norm": 0.06550833011506903,
434
+ "learning_rate": 0.000179087524283907,
435
+ "loss": 0.8137,
436
  "step": 300
437
  },
438
  {
439
+ "epoch": 0.29,
440
+ "grad_norm": 0.058622811505104906,
441
+ "learning_rate": 0.00017804829248452395,
442
+ "loss": 0.7512,
443
  "step": 305
444
  },
445
  {
446
+ "epoch": 0.3,
447
+ "grad_norm": 0.0646444641030475,
448
+ "learning_rate": 0.00017698703295619052,
449
+ "loss": 0.7908,
450
  "step": 310
451
  },
452
  {
453
+ "epoch": 0.3,
454
+ "grad_norm": 0.06559061358782307,
455
+ "learning_rate": 0.00017590404522035028,
456
+ "loss": 0.7308,
457
  "step": 315
458
  },
459
  {
460
+ "epoch": 0.31,
461
+ "grad_norm": 0.06308845309249086,
462
+ "learning_rate": 0.00017479963493084329,
463
+ "loss": 0.7643,
464
  "step": 320
465
  },
466
  {
467
+ "epoch": 0.31,
468
+ "grad_norm": 0.06406066039467145,
469
+ "learning_rate": 0.0001736741137876405,
470
+ "loss": 0.7775,
471
  "step": 325
472
  },
473
  {
474
+ "epoch": 0.32,
475
+ "grad_norm": 0.06444860497739553,
476
+ "learning_rate": 0.00017252779944887235,
477
+ "loss": 0.7774,
478
  "step": 330
479
  },
480
  {
481
+ "epoch": 0.32,
482
+ "grad_norm": 0.06604642339408012,
483
+ "learning_rate": 0.00017136101544117525,
484
+ "loss": 0.7362,
485
  "step": 335
486
  },
487
  {
488
+ "epoch": 0.33,
489
+ "grad_norm": 0.06850287335544007,
490
+ "learning_rate": 0.00017017409106838207,
491
+ "loss": 0.7501,
492
  "step": 340
493
  },
494
  {
495
+ "epoch": 0.33,
496
+ "grad_norm": 0.06459535279086648,
497
+ "learning_rate": 0.00016896736131858208,
498
+ "loss": 0.7606,
499
  "step": 345
500
  },
501
  {
502
+ "epoch": 0.34,
503
+ "grad_norm": 0.06245922169687922,
504
+ "learning_rate": 0.0001677411667695765,
505
+ "loss": 0.7459,
506
  "step": 350
507
  },
508
  {
509
+ "epoch": 0.34,
510
+ "grad_norm": 0.0618217626323294,
511
+ "learning_rate": 0.00016649585349275662,
512
+ "loss": 0.7608,
513
  "step": 355
514
  },
515
  {
516
+ "epoch": 0.35,
517
+ "grad_norm": 0.06529458738837608,
518
+ "learning_rate": 0.0001652317729554313,
519
+ "loss": 0.7793,
520
  "step": 360
521
  },
522
  {
523
+ "epoch": 0.35,
524
+ "grad_norm": 0.07281345190056655,
525
+ "learning_rate": 0.0001639492819216316,
526
+ "loss": 0.7769,
527
  "step": 365
528
  },
529
  {
530
+ "epoch": 0.36,
531
+ "grad_norm": 0.07253822403948054,
532
+ "learning_rate": 0.0001626487423514207,
533
+ "loss": 0.7699,
534
  "step": 370
535
  },
536
  {
537
+ "epoch": 0.36,
538
+ "grad_norm": 0.059915171172645505,
539
+ "learning_rate": 0.00016133052129873693,
540
+ "loss": 0.7426,
541
  "step": 375
542
  },
543
  {
544
+ "epoch": 0.37,
545
+ "grad_norm": 0.06063815927279327,
546
+ "learning_rate": 0.0001599949908077996,
547
+ "loss": 0.7859,
548
  "step": 380
549
  },
550
  {
551
+ "epoch": 0.37,
552
+ "grad_norm": 0.07982151033712452,
553
+ "learning_rate": 0.00015864252780810616,
554
+ "loss": 0.7484,
555
  "step": 385
556
  },
557
  {
558
+ "epoch": 0.38,
559
+ "grad_norm": 0.07807371830206032,
560
+ "learning_rate": 0.00015727351400805052,
561
+ "loss": 0.7318,
562
  "step": 390
563
  },
564
  {
565
+ "epoch": 0.38,
566
+ "grad_norm": 0.06359423217230728,
567
+ "learning_rate": 0.0001558883357871928,
568
+ "loss": 0.7707,
569
  "step": 395
570
  },
571
  {
572
+ "epoch": 0.38,
573
+ "grad_norm": 0.09384048396706658,
574
+ "learning_rate": 0.00015448738408721052,
575
+ "loss": 0.7869,
576
  "step": 400
577
  },
578
  {
579
+ "epoch": 0.39,
580
+ "grad_norm": 0.07059181179022768,
581
+ "learning_rate": 0.00015307105430156255,
582
+ "loss": 0.7139,
583
  "step": 405
584
  },
585
  {
586
+ "epoch": 0.39,
587
+ "grad_norm": 0.07407320984033573,
588
+ "learning_rate": 0.0001516397461638962,
589
+ "loss": 0.7476,
590
  "step": 410
591
  },
592
  {
593
+ "epoch": 0.4,
594
+ "grad_norm": 0.07240911601504253,
595
+ "learning_rate": 0.0001501938636352297,
596
+ "loss": 0.7655,
597
  "step": 415
598
  },
599
  {
600
+ "epoch": 0.4,
601
+ "grad_norm": 0.07068099666527985,
602
+ "learning_rate": 0.00014873381478994134,
603
+ "loss": 0.7893,
604
  "step": 420
605
  },
606
  {
607
+ "epoch": 0.41,
608
+ "grad_norm": 0.08305359605908401,
609
+ "learning_rate": 0.00014726001170059792,
610
+ "loss": 0.7111,
611
  "step": 425
612
  },
613
  {
614
+ "epoch": 0.41,
615
+ "grad_norm": 0.0670085750282625,
616
+ "learning_rate": 0.00014577287032165468,
617
+ "loss": 0.7527,
618
  "step": 430
619
  },
620
  {
621
+ "epoch": 0.42,
622
+ "grad_norm": 0.06789518226534799,
623
+ "learning_rate": 0.00014427281037205945,
624
+ "loss": 0.7751,
625
  "step": 435
626
  },
627
  {
628
+ "epoch": 0.42,
629
+ "grad_norm": 0.09136108168814464,
630
+ "learning_rate": 0.00014276025521679471,
631
+ "loss": 0.726,
632
  "step": 440
633
  },
634
  {
635
+ "epoch": 0.43,
636
+ "grad_norm": 0.06956344474331025,
637
+ "learning_rate": 0.00014123563174739037,
638
+ "loss": 0.8187,
639
  "step": 445
640
  },
641
  {
642
+ "epoch": 0.43,
643
+ "grad_norm": 0.07271098739476833,
644
+ "learning_rate": 0.00013969937026144118,
645
+ "loss": 0.7787,
646
  "step": 450
647
  },
648
  {
649
+ "epoch": 0.44,
650
+ "grad_norm": 0.06984149301611005,
651
+ "learning_rate": 0.00013815190434116317,
652
+ "loss": 0.7873,
653
  "step": 455
654
  },
655
  {
656
+ "epoch": 0.44,
657
+ "grad_norm": 0.07335035453389814,
658
+ "learning_rate": 0.00013659367073102268,
659
+ "loss": 0.7609,
660
  "step": 460
661
  },
662
  {
663
+ "epoch": 0.45,
664
+ "grad_norm": 0.0744105188121296,
665
+ "learning_rate": 0.00013502510921447323,
666
+ "loss": 0.7169,
667
  "step": 465
668
  },
669
  {
670
+ "epoch": 0.45,
671
+ "grad_norm": 0.06947572989302908,
672
+ "learning_rate": 0.00013344666248983432,
673
+ "loss": 0.7837,
674
  "step": 470
675
  },
676
  {
677
+ "epoch": 0.46,
678
+ "grad_norm": 0.07999096873197906,
679
+ "learning_rate": 0.000131858776045348,
680
+ "loss": 0.7727,
681
  "step": 475
682
  },
683
  {
684
+ "epoch": 0.46,
685
+ "grad_norm": 0.07774452098436961,
686
+ "learning_rate": 0.00013026189803344774,
687
+ "loss": 0.8242,
688
  "step": 480
689
  },
690
  {
691
+ "epoch": 0.47,
692
+ "grad_norm": 0.07801636818427328,
693
+ "learning_rate": 0.00012865647914427544,
694
+ "loss": 0.7269,
695
  "step": 485
696
  },
697
  {
698
+ "epoch": 0.47,
699
+ "grad_norm": 0.11086472292549841,
700
+ "learning_rate": 0.00012704297247848216,
701
+ "loss": 0.7503,
702
  "step": 490
703
  },
704
  {
705
+ "epoch": 0.48,
706
+ "grad_norm": 0.0724703886664607,
707
+ "learning_rate": 0.00012542183341934872,
708
+ "loss": 0.81,
709
  "step": 495
710
  },
711
  {
712
+ "epoch": 0.48,
713
+ "grad_norm": 0.08424198871373768,
714
+ "learning_rate": 0.00012379351950426187,
715
+ "loss": 0.7102,
716
  "step": 500
717
  },
718
  {
719
+ "epoch": 0.49,
720
+ "grad_norm": 0.07399506224237913,
721
+ "learning_rate": 0.0001221584902955827,
722
+ "loss": 0.811,
723
  "step": 505
724
  },
725
  {
726
+ "epoch": 0.49,
727
+ "grad_norm": 0.07513530266538687,
728
+ "learning_rate": 0.00012051720725094324,
729
+ "loss": 0.7328,
730
  "step": 510
731
  },
732
  {
733
+ "epoch": 0.5,
734
+ "grad_norm": 0.08300665590930759,
735
+ "learning_rate": 0.00011887013359300837,
736
+ "loss": 0.7728,
737
  "step": 515
738
  },
739
  {
740
+ "epoch": 0.5,
741
+ "grad_norm": 0.08603160091015216,
742
+ "learning_rate": 0.00011721773417873965,
743
+ "loss": 0.8092,
744
  "step": 520
745
  },
746
  {
747
+ "epoch": 0.51,
748
+ "grad_norm": 0.06828684892985817,
749
+ "learning_rate": 0.00011556047536819777,
750
+ "loss": 0.7905,
751
  "step": 525
752
  },
753
  {
754
+ "epoch": 0.51,
755
+ "grad_norm": 0.0888375409363043,
756
+ "learning_rate": 0.00011389882489292061,
757
+ "loss": 0.7616,
758
  "step": 530
759
  },
760
  {
761
+ "epoch": 0.51,
762
+ "grad_norm": 0.07153992771335722,
763
+ "learning_rate": 0.0001122332517239147,
764
+ "loss": 0.7231,
765
  "step": 535
766
  },
767
  {
768
+ "epoch": 0.52,
769
+ "grad_norm": 0.07078385208785225,
770
+ "learning_rate": 0.00011056422593929635,
771
+ "loss": 0.7744,
772
  "step": 540
773
  },
774
  {
775
+ "epoch": 0.52,
776
+ "grad_norm": 0.06724179763601197,
777
+ "learning_rate": 0.00010889221859162062,
778
+ "loss": 0.7385,
779
  "step": 545
780
  },
781
  {
782
+ "epoch": 0.53,
783
+ "grad_norm": 0.08127731002438206,
784
+ "learning_rate": 0.00010721770157493527,
785
+ "loss": 0.737,
786
  "step": 550
787
  },
788
  {
789
+ "epoch": 0.53,
790
+ "grad_norm": 0.07971841929977522,
791
+ "learning_rate": 0.000105541147491597,
792
+ "loss": 0.7129,
793
  "step": 555
794
  },
795
  {
796
+ "epoch": 0.54,
797
+ "grad_norm": 0.09905140750790473,
798
+ "learning_rate": 0.00010386302951888804,
799
+ "loss": 0.7682,
800
  "step": 560
801
  },
802
  {
803
+ "epoch": 0.54,
804
+ "grad_norm": 0.07787324370915785,
805
+ "learning_rate": 0.00010218382127547022,
806
+ "loss": 0.7988,
807
  "step": 565
808
  },
809
  {
810
+ "epoch": 0.55,
811
+ "grad_norm": 0.06979898484314451,
812
+ "learning_rate": 0.00010050399668771479,
813
+ "loss": 0.7505,
814
  "step": 570
815
  },
816
  {
817
+ "epoch": 0.55,
818
+ "grad_norm": 0.08294483662862129,
819
+ "learning_rate": 9.882402985594515e-05,
820
+ "loss": 0.7254,
821
  "step": 575
822
  },
823
  {
824
+ "epoch": 0.56,
825
+ "grad_norm": 0.09404505448514887,
826
+ "learning_rate": 9.71443949206304e-05,
827
+ "loss": 0.7744,
828
  "step": 580
829
  },
830
  {
831
+ "epoch": 0.56,
832
+ "grad_norm": 0.08296480696219607,
833
+ "learning_rate": 9.546556592856789e-05,
834
+ "loss": 0.7255,
835
  "step": 585
836
  },
837
  {
838
+ "epoch": 0.57,
839
+ "grad_norm": 0.08380203772972816,
840
+ "learning_rate": 9.378801669909197e-05,
841
+ "loss": 0.6704,
842
  "step": 590
843
  },
844
  {
845
+ "epoch": 0.57,
846
+ "grad_norm": 0.09814150545542286,
847
+ "learning_rate": 9.211222069034695e-05,
848
+ "loss": 0.7107,
849
  "step": 595
850
  },
851
  {
852
+ "epoch": 0.58,
853
+ "grad_norm": 0.0951778654867295,
854
+ "learning_rate": 9.043865086566214e-05,
855
+ "loss": 0.7158,
856
  "step": 600
857
  },
858
  {
859
+ "epoch": 0.58,
860
+ "grad_norm": 0.08511949253631121,
861
+ "learning_rate": 8.87677795600663e-05,
862
+ "loss": 0.7572,
863
  "step": 605
864
  },
865
  {
866
+ "epoch": 0.59,
867
+ "grad_norm": 0.1143213911299264,
868
+ "learning_rate": 8.710007834697969e-05,
869
+ "loss": 0.7785,
870
  "step": 610
871
  },
872
  {
873
+ "epoch": 0.59,
874
+ "grad_norm": 0.08008142575287244,
875
+ "learning_rate": 8.543601790512083e-05,
876
+ "loss": 0.7327,
877
  "step": 615
878
  },
879
  {
880
+ "epoch": 0.6,
881
+ "grad_norm": 0.09526058947958355,
882
+ "learning_rate": 8.377606788566597e-05,
883
+ "loss": 0.703,
884
  "step": 620
885
  },
886
  {
887
+ "epoch": 0.6,
888
+ "grad_norm": 0.08273005979279956,
889
+ "learning_rate": 8.212069677969851e-05,
890
+ "loss": 0.7497,
891
  "step": 625
892
  },
893
  {
894
+ "epoch": 0.61,
895
+ "grad_norm": 0.08864939979402765,
896
+ "learning_rate": 8.047037178598567e-05,
897
+ "loss": 0.7573,
898
  "step": 630
899
  },
900
  {
901
+ "epoch": 0.61,
902
+ "grad_norm": 0.08394557070047488,
903
+ "learning_rate": 7.882555867912017e-05,
904
+ "loss": 0.7827,
905
  "step": 635
906
  },
907
  {
908
+ "epoch": 0.62,
909
+ "grad_norm": 0.09978852942456092,
910
+ "learning_rate": 7.718672167806354e-05,
911
+ "loss": 0.7201,
912
  "step": 640
913
  },
914
  {
915
+ "epoch": 0.62,
916
+ "grad_norm": 0.07987549874350373,
917
+ "learning_rate": 7.55543233151289e-05,
918
+ "loss": 0.7129,
919
  "step": 645
920
  },
921
  {
922
+ "epoch": 0.63,
923
+ "grad_norm": 0.09662339979538469,
924
+ "learning_rate": 7.392882430543928e-05,
925
+ "loss": 0.7593,
926
  "step": 650
927
  },
928
  {
929
+ "epoch": 0.63,
930
+ "grad_norm": 0.0966150953702372,
931
+ "learning_rate": 7.231068341689923e-05,
932
+ "loss": 0.6704,
933
  "step": 655
934
  },
935
  {
936
+ "epoch": 0.63,
937
+ "grad_norm": 0.08961534426031717,
938
+ "learning_rate": 7.070035734071574e-05,
939
+ "loss": 0.781,
940
  "step": 660
941
  },
942
  {
943
+ "epoch": 0.64,
944
+ "grad_norm": 0.09684345543910731,
945
+ "learning_rate": 6.909830056250527e-05,
946
+ "loss": 0.7787,
947
  "step": 665
948
  },
949
  {
950
+ "epoch": 0.64,
951
+ "grad_norm": 0.10518057650982461,
952
+ "learning_rate": 6.750496523402352e-05,
953
+ "loss": 0.7658,
954
  "step": 670
955
  },
956
  {
957
+ "epoch": 0.65,
958
+ "grad_norm": 0.08726496496105966,
959
+ "learning_rate": 6.592080104555357e-05,
960
+ "loss": 0.7515,
961
  "step": 675
962
  },
963
+ {
964
+ "epoch": 0.65,
965
+ "grad_norm": 0.0933748204446253,
966
+ "learning_rate": 6.434625509898897e-05,
967
+ "loss": 0.7474,
968
+ "step": 680
969
+ },
970
+ {
971
+ "epoch": 0.66,
972
+ "grad_norm": 0.09987049741300834,
973
+ "learning_rate": 6.278177178164721e-05,
974
+ "loss": 0.7458,
975
+ "step": 685
976
+ },
977
+ {
978
+ "epoch": 0.66,
979
+ "grad_norm": 0.10265280285088377,
980
+ "learning_rate": 6.122779264084932e-05,
981
+ "loss": 0.7194,
982
+ "step": 690
983
+ },
984
+ {
985
+ "epoch": 0.67,
986
+ "grad_norm": 0.09519605624873385,
987
+ "learning_rate": 5.968475625930124e-05,
988
+ "loss": 0.7788,
989
+ "step": 695
990
+ },
991
+ {
992
+ "epoch": 0.67,
993
+ "grad_norm": 0.10432725726790697,
994
+ "learning_rate": 5.815309813131153e-05,
995
+ "loss": 0.6987,
996
+ "step": 700
997
+ },
998
+ {
999
+ "epoch": 0.68,
1000
+ "grad_norm": 0.0895838664526722,
1001
+ "learning_rate": 5.663325053988112e-05,
1002
+ "loss": 0.7438,
1003
+ "step": 705
1004
+ },
1005
+ {
1006
+ "epoch": 0.68,
1007
+ "grad_norm": 0.1020683075125265,
1008
+ "learning_rate": 5.5125642434699044e-05,
1009
+ "loss": 0.7329,
1010
+ "step": 710
1011
+ },
1012
+ {
1013
+ "epoch": 0.69,
1014
+ "grad_norm": 0.1254242358800245,
1015
+ "learning_rate": 5.363069931107902e-05,
1016
+ "loss": 0.7701,
1017
+ "step": 715
1018
+ },
1019
+ {
1020
+ "epoch": 0.69,
1021
+ "grad_norm": 0.09171973242978361,
1022
+ "learning_rate": 5.214884308987136e-05,
1023
+ "loss": 0.7614,
1024
+ "step": 720
1025
+ },
1026
+ {
1027
+ "epoch": 0.7,
1028
+ "grad_norm": 0.0973938129022939,
1029
+ "learning_rate": 5.068049199838307e-05,
1030
+ "loss": 0.7654,
1031
+ "step": 725
1032
+ },
1033
+ {
1034
+ "epoch": 0.7,
1035
+ "grad_norm": 0.10651861956923717,
1036
+ "learning_rate": 4.9226060452340825e-05,
1037
+ "loss": 0.7459,
1038
+ "step": 730
1039
+ },
1040
+ {
1041
+ "epoch": 0.71,
1042
+ "grad_norm": 0.10089952208386434,
1043
+ "learning_rate": 4.7785958938929644e-05,
1044
+ "loss": 0.7259,
1045
+ "step": 735
1046
+ },
1047
+ {
1048
+ "epoch": 0.71,
1049
+ "grad_norm": 0.1210514333134373,
1050
+ "learning_rate": 4.6360593900940074e-05,
1051
+ "loss": 0.7434,
1052
+ "step": 740
1053
+ },
1054
+ {
1055
+ "epoch": 0.72,
1056
+ "grad_norm": 0.09318485524858554,
1057
+ "learning_rate": 4.4950367622057173e-05,
1058
+ "loss": 0.7452,
1059
+ "step": 745
1060
+ },
1061
+ {
1062
+ "epoch": 0.72,
1063
+ "grad_norm": 0.0982653281602369,
1064
+ "learning_rate": 4.355567811332311e-05,
1065
+ "loss": 0.7647,
1066
+ "step": 750
1067
+ },
1068
+ {
1069
+ "epoch": 0.73,
1070
+ "grad_norm": 0.10433832624807006,
1071
+ "learning_rate": 4.21769190008056e-05,
1072
+ "loss": 0.7786,
1073
+ "step": 755
1074
+ },
1075
+ {
1076
+ "epoch": 0.73,
1077
+ "grad_norm": 0.10945535727159489,
1078
+ "learning_rate": 4.081447941450428e-05,
1079
+ "loss": 0.7534,
1080
+ "step": 760
1081
+ },
1082
+ {
1083
+ "epoch": 0.74,
1084
+ "grad_norm": 0.08979908942148239,
1085
+ "learning_rate": 3.946874387852545e-05,
1086
+ "loss": 0.7684,
1087
+ "step": 765
1088
+ },
1089
+ {
1090
+ "epoch": 0.74,
1091
+ "grad_norm": 0.10406027811162083,
1092
+ "learning_rate": 3.8140092202557185e-05,
1093
+ "loss": 0.722,
1094
+ "step": 770
1095
+ },
1096
+ {
1097
+ "epoch": 0.75,
1098
+ "grad_norm": 0.1035268479061034,
1099
+ "learning_rate": 3.682889937467493e-05,
1100
+ "loss": 0.7553,
1101
+ "step": 775
1102
+ },
1103
+ {
1104
+ "epoch": 0.75,
1105
+ "grad_norm": 0.10887919806436043,
1106
+ "learning_rate": 3.553553545550768e-05,
1107
+ "loss": 0.7246,
1108
+ "step": 780
1109
+ },
1110
+ {
1111
+ "epoch": 0.76,
1112
+ "grad_norm": 0.09604076773803726,
1113
+ "learning_rate": 3.426036547379528e-05,
1114
+ "loss": 0.7608,
1115
+ "step": 785
1116
+ },
1117
+ {
1118
+ "epoch": 0.76,
1119
+ "grad_norm": 0.10956385216460547,
1120
+ "learning_rate": 3.300374932336533e-05,
1121
+ "loss": 0.7338,
1122
+ "step": 790
1123
+ },
1124
+ {
1125
+ "epoch": 0.76,
1126
+ "grad_norm": 0.10172528555623694,
1127
+ "learning_rate": 3.176604166155976e-05,
1128
+ "loss": 0.7495,
1129
+ "step": 795
1130
+ },
1131
+ {
1132
+ "epoch": 0.77,
1133
+ "grad_norm": 0.10744826094269415,
1134
+ "learning_rate": 3.054759180913921e-05,
1135
+ "loss": 0.8015,
1136
+ "step": 800
1137
+ },
1138
+ {
1139
+ "epoch": 0.77,
1140
+ "grad_norm": 0.09531428953628962,
1141
+ "learning_rate": 2.9348743651693357e-05,
1142
+ "loss": 0.7432,
1143
+ "step": 805
1144
+ },
1145
+ {
1146
+ "epoch": 0.78,
1147
+ "grad_norm": 0.09578814140538712,
1148
+ "learning_rate": 2.8169835542585587e-05,
1149
+ "loss": 0.6876,
1150
+ "step": 810
1151
+ },
1152
+ {
1153
+ "epoch": 0.78,
1154
+ "grad_norm": 0.10906430952489729,
1155
+ "learning_rate": 2.7011200207458677e-05,
1156
+ "loss": 0.7461,
1157
+ "step": 815
1158
+ },
1159
+ {
1160
+ "epoch": 0.79,
1161
+ "grad_norm": 0.09924057812689555,
1162
+ "learning_rate": 2.5873164650328996e-05,
1163
+ "loss": 0.7403,
1164
+ "step": 820
1165
+ },
1166
+ {
1167
+ "epoch": 0.79,
1168
+ "grad_norm": 0.102534066373728,
1169
+ "learning_rate": 2.4756050061295534e-05,
1170
+ "loss": 0.7771,
1171
+ "step": 825
1172
+ },
1173
+ {
1174
+ "epoch": 0.8,
1175
+ "grad_norm": 0.11379854278214475,
1176
+ "learning_rate": 2.36601717258897e-05,
1177
+ "loss": 0.7494,
1178
+ "step": 830
1179
+ },
1180
+ {
1181
+ "epoch": 0.8,
1182
+ "grad_norm": 0.09555943569834097,
1183
+ "learning_rate": 2.2585838936091754e-05,
1184
+ "loss": 0.7062,
1185
+ "step": 835
1186
+ },
1187
+ {
1188
+ "epoch": 0.81,
1189
+ "grad_norm": 0.08701341120074446,
1190
+ "learning_rate": 2.153335490303856e-05,
1191
+ "loss": 0.7029,
1192
+ "step": 840
1193
+ },
1194
+ {
1195
+ "epoch": 0.81,
1196
+ "grad_norm": 0.10955955931583401,
1197
+ "learning_rate": 2.0503016671447785e-05,
1198
+ "loss": 0.7119,
1199
+ "step": 845
1200
+ },
1201
+ {
1202
+ "epoch": 0.82,
1203
+ "grad_norm": 0.09961678142736638,
1204
+ "learning_rate": 1.9495115035782307e-05,
1205
+ "loss": 0.7181,
1206
+ "step": 850
1207
+ },
1208
+ {
1209
+ "epoch": 0.82,
1210
+ "grad_norm": 0.10875433120436596,
1211
+ "learning_rate": 1.8509934458178712e-05,
1212
+ "loss": 0.7221,
1213
+ "step": 855
1214
+ },
1215
+ {
1216
+ "epoch": 0.83,
1217
+ "grad_norm": 0.10419310033343766,
1218
+ "learning_rate": 1.754775298816307e-05,
1219
+ "loss": 0.7627,
1220
+ "step": 860
1221
+ },
1222
+ {
1223
+ "epoch": 0.83,
1224
+ "grad_norm": 0.11125643674385945,
1225
+ "learning_rate": 1.6608842184176243e-05,
1226
+ "loss": 0.783,
1227
+ "step": 865
1228
+ },
1229
+ {
1230
+ "epoch": 0.84,
1231
+ "grad_norm": 0.09805407303806633,
1232
+ "learning_rate": 1.5693467036931576e-05,
1233
+ "loss": 0.7754,
1234
+ "step": 870
1235
+ },
1236
+ {
1237
+ "epoch": 0.84,
1238
+ "grad_norm": 0.09382093887295272,
1239
+ "learning_rate": 1.48018858946259e-05,
1240
+ "loss": 0.7306,
1241
+ "step": 875
1242
+ },
1243
+ {
1244
+ "epoch": 0.85,
1245
+ "grad_norm": 0.08828049016039903,
1246
+ "learning_rate": 1.3934350390025463e-05,
1247
+ "loss": 0.7277,
1248
+ "step": 880
1249
+ },
1250
+ {
1251
+ "epoch": 0.85,
1252
+ "grad_norm": 0.10111049793555948,
1253
+ "learning_rate": 1.3091105369447165e-05,
1254
+ "loss": 0.7433,
1255
+ "step": 885
1256
+ },
1257
+ {
1258
+ "epoch": 0.86,
1259
+ "grad_norm": 0.09959284476126003,
1260
+ "learning_rate": 1.22723888236549e-05,
1261
+ "loss": 0.7608,
1262
+ "step": 890
1263
+ },
1264
+ {
1265
+ "epoch": 0.86,
1266
+ "grad_norm": 0.12000298215227106,
1267
+ "learning_rate": 1.1478431820691083e-05,
1268
+ "loss": 0.7249,
1269
+ "step": 895
1270
+ },
1271
+ {
1272
+ "epoch": 0.87,
1273
+ "grad_norm": 0.09846825711770221,
1274
+ "learning_rate": 1.0709458440661801e-05,
1275
+ "loss": 0.7474,
1276
+ "step": 900
1277
+ },
1278
+ {
1279
+ "epoch": 0.87,
1280
+ "grad_norm": 0.08826572755281627,
1281
+ "learning_rate": 9.965685712494199e-06,
1282
+ "loss": 0.7125,
1283
+ "step": 905
1284
+ },
1285
+ {
1286
+ "epoch": 0.88,
1287
+ "grad_norm": 0.12402329261776916,
1288
+ "learning_rate": 9.247323552684051e-06,
1289
+ "loss": 0.7685,
1290
+ "step": 910
1291
+ },
1292
+ {
1293
+ "epoch": 0.88,
1294
+ "grad_norm": 0.10462400264867344,
1295
+ "learning_rate": 8.554574706050488e-06,
1296
+ "loss": 0.7884,
1297
+ "step": 915
1298
+ },
1299
+ {
1300
+ "epoch": 0.89,
1301
+ "grad_norm": 0.11275389812311376,
1302
+ "learning_rate": 7.887634688515e-06,
1303
+ "loss": 0.7487,
1304
+ "step": 920
1305
+ },
1306
+ {
1307
+ "epoch": 0.89,
1308
+ "grad_norm": 0.11871268386808256,
1309
+ "learning_rate": 7.246691731920485e-06,
1310
+ "loss": 0.7607,
1311
+ "step": 925
1312
+ },
1313
+ {
1314
+ "epoch": 0.89,
1315
+ "grad_norm": 0.12345475324952107,
1316
+ "learning_rate": 6.631926730906324e-06,
1317
+ "loss": 0.7716,
1318
+ "step": 930
1319
+ },
1320
+ {
1321
+ "epoch": 0.9,
1322
+ "grad_norm": 0.10653833929745236,
1323
+ "learning_rate": 6.043513191853978e-06,
1324
+ "loss": 0.7465,
1325
+ "step": 935
1326
+ },
1327
+ {
1328
+ "epoch": 0.9,
1329
+ "grad_norm": 0.09013425987778603,
1330
+ "learning_rate": 5.481617183918053e-06,
1331
+ "loss": 0.7543,
1332
+ "step": 940
1333
+ },
1334
+ {
1335
+ "epoch": 0.91,
1336
+ "grad_norm": 0.09465293194517142,
1337
+ "learning_rate": 4.946397292156158e-06,
1338
+ "loss": 0.736,
1339
+ "step": 945
1340
+ },
1341
+ {
1342
+ "epoch": 0.91,
1343
+ "grad_norm": 0.10787353868301028,
1344
+ "learning_rate": 4.438004572771182e-06,
1345
+ "loss": 0.7284,
1346
+ "step": 950
1347
+ },
1348
+ {
1349
+ "epoch": 0.92,
1350
+ "grad_norm": 0.1429868308987827,
1351
+ "learning_rate": 3.9565825104783685e-06,
1352
+ "loss": 0.7907,
1353
+ "step": 955
1354
+ },
1355
+ {
1356
+ "epoch": 0.92,
1357
+ "grad_norm": 0.10249198050493027,
1358
+ "learning_rate": 3.5022669780093497e-06,
1359
+ "loss": 0.7203,
1360
+ "step": 960
1361
+ },
1362
+ {
1363
+ "epoch": 0.93,
1364
+ "grad_norm": 0.1028361192764587,
1365
+ "learning_rate": 3.0751861977645125e-06,
1366
+ "loss": 0.7284,
1367
+ "step": 965
1368
+ },
1369
+ {
1370
+ "epoch": 0.93,
1371
+ "grad_norm": 0.1278500182671187,
1372
+ "learning_rate": 2.6754607056244883e-06,
1373
+ "loss": 0.7447,
1374
+ "step": 970
1375
+ },
1376
+ {
1377
+ "epoch": 0.94,
1378
+ "grad_norm": 0.09199143538365835,
1379
+ "learning_rate": 2.303203316931102e-06,
1380
+ "loss": 0.7173,
1381
+ "step": 975
1382
+ },
1383
+ {
1384
+ "epoch": 0.94,
1385
+ "grad_norm": 0.10797760289216082,
1386
+ "learning_rate": 1.9585190946472488e-06,
1387
+ "loss": 0.7163,
1388
+ "step": 980
1389
+ },
1390
+ {
1391
+ "epoch": 0.95,
1392
+ "grad_norm": 0.09281102451549036,
1393
+ "learning_rate": 1.6415053197047725e-06,
1394
+ "loss": 0.7284,
1395
+ "step": 985
1396
+ },
1397
+ {
1398
+ "epoch": 0.95,
1399
+ "grad_norm": 0.11431066643386725,
1400
+ "learning_rate": 1.3522514635486816e-06,
1401
+ "loss": 0.7723,
1402
+ "step": 990
1403
+ },
1404
+ {
1405
+ "epoch": 0.96,
1406
+ "grad_norm": 0.09277614207193054,
1407
+ "learning_rate": 1.0908391628854041e-06,
1408
+ "loss": 0.7623,
1409
+ "step": 995
1410
+ },
1411
+ {
1412
+ "epoch": 0.96,
1413
+ "grad_norm": 0.0970512860952287,
1414
+ "learning_rate": 8.57342196642319e-07,
1415
+ "loss": 0.6736,
1416
+ "step": 1000
1417
+ },
1418
+ {
1419
+ "epoch": 0.97,
1420
+ "grad_norm": 0.10404789525698893,
1421
+ "learning_rate": 6.518264651449779e-07,
1422
+ "loss": 0.7771,
1423
+ "step": 1005
1424
+ },
1425
+ {
1426
+ "epoch": 0.97,
1427
+ "grad_norm": 0.09905350029610553,
1428
+ "learning_rate": 4.743499715179067e-07,
1429
+ "loss": 0.7495,
1430
+ "step": 1010
1431
+ },
1432
+ {
1433
+ "epoch": 0.98,
1434
+ "grad_norm": 0.08940324837180028,
1435
+ "learning_rate": 3.249628053142884e-07,
1436
+ "loss": 0.7587,
1437
+ "step": 1015
1438
+ },
1439
+ {
1440
+ "epoch": 0.98,
1441
+ "grad_norm": 0.10668411274319571,
1442
+ "learning_rate": 2.0370712837906037e-07,
1443
+ "loss": 0.7657,
1444
+ "step": 1020
1445
+ },
1446
+ {
1447
+ "epoch": 0.99,
1448
+ "grad_norm": 0.0933938060668371,
1449
+ "learning_rate": 1.1061716294951118e-07,
1450
+ "loss": 0.7493,
1451
+ "step": 1025
1452
+ },
1453
+ {
1454
+ "epoch": 0.99,
1455
+ "grad_norm": 0.09579015452060988,
1456
+ "learning_rate": 4.5719181996650705e-08,
1457
+ "loss": 0.7677,
1458
+ "step": 1030
1459
+ },
1460
+ {
1461
+ "epoch": 1.0,
1462
+ "grad_norm": 0.1038186774195005,
1463
+ "learning_rate": 9.031501810174981e-09,
1464
+ "loss": 0.7327,
1465
+ "step": 1035
1466
+ },
1467
  {
1468
  "epoch": 1.0,
1469
  "eval_loss": NaN,
1470
+ "eval_runtime": 3002.6712,
1471
+ "eval_samples_per_second": 0.769,
1472
+ "eval_steps_per_second": 0.192,
1473
+ "step": 1039
1474
  },
1475
  {
1476
  "epoch": 1.0,
1477
+ "step": 1039,
1478
+ "total_flos": 3.479777588753203e+16,
1479
+ "train_loss": 0.7603742487735766,
1480
+ "train_runtime": 32307.7024,
1481
+ "train_samples_per_second": 1.029,
1482
+ "train_steps_per_second": 0.032
1483
  }
1484
  ],
1485
  "logging_steps": 5,
1486
+ "max_steps": 1039,
1487
  "num_input_tokens_seen": 0,
1488
  "num_train_epochs": 1,
1489
  "save_steps": 100,
1490
+ "total_flos": 3.479777588753203e+16,
1491
  "train_batch_size": 4,
1492
  "trial_name": null,
1493
  "trial_params": null