dada22231 commited on
Commit
a8e9c76
·
verified ·
1 Parent(s): bd31070

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e263088b00686cadca49f80a54bc46c546951592cb1ea291618c32b90cb072a4
3
  size 685354800
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fab20f32621c18f7e567e201b068fd4e502966a6cf5803e5436f2004a34e1fe
3
  size 685354800
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 4.847986852917009,
6
  "eval_steps": 500,
7
- "global_step": 5900,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -41308,6 +41308,706 @@
41308
  "learning_rate": 0.0005,
41309
  "loss": 1.3641,
41310
  "step": 5900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41311
  }
41312
  ],
41313
  "logging_steps": 1,
@@ -41322,12 +42022,12 @@
41322
  "should_evaluate": false,
41323
  "should_log": false,
41324
  "should_save": true,
41325
- "should_training_stop": false
41326
  },
41327
  "attributes": {}
41328
  }
41329
  },
41330
- "total_flos": 1.2272956063875072e+18,
41331
  "train_batch_size": 8,
41332
  "trial_name": null,
41333
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 4.930156121610517,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
41308
  "learning_rate": 0.0005,
41309
  "loss": 1.3641,
41310
  "step": 5900
41311
+ },
41312
+ {
41313
+ "epoch": 4.848808545603944,
41314
+ "grad_norm": 0.5559499859809875,
41315
+ "learning_rate": 0.0005,
41316
+ "loss": 1.3687,
41317
+ "step": 5901
41318
+ },
41319
+ {
41320
+ "epoch": 4.849630238290879,
41321
+ "grad_norm": 0.535542905330658,
41322
+ "learning_rate": 0.0005,
41323
+ "loss": 1.2836,
41324
+ "step": 5902
41325
+ },
41326
+ {
41327
+ "epoch": 4.8504519309778145,
41328
+ "grad_norm": 0.5417159199714661,
41329
+ "learning_rate": 0.0005,
41330
+ "loss": 1.3408,
41331
+ "step": 5903
41332
+ },
41333
+ {
41334
+ "epoch": 4.851273623664749,
41335
+ "grad_norm": 0.543630838394165,
41336
+ "learning_rate": 0.0005,
41337
+ "loss": 1.3996,
41338
+ "step": 5904
41339
+ },
41340
+ {
41341
+ "epoch": 4.852095316351685,
41342
+ "grad_norm": 0.5195862054824829,
41343
+ "learning_rate": 0.0005,
41344
+ "loss": 1.3169,
41345
+ "step": 5905
41346
+ },
41347
+ {
41348
+ "epoch": 4.852917009038619,
41349
+ "grad_norm": 0.5713696479797363,
41350
+ "learning_rate": 0.0005,
41351
+ "loss": 1.4069,
41352
+ "step": 5906
41353
+ },
41354
+ {
41355
+ "epoch": 4.853738701725555,
41356
+ "grad_norm": 0.5813581943511963,
41357
+ "learning_rate": 0.0005,
41358
+ "loss": 1.3037,
41359
+ "step": 5907
41360
+ },
41361
+ {
41362
+ "epoch": 4.8545603944124895,
41363
+ "grad_norm": 0.5637884140014648,
41364
+ "learning_rate": 0.0005,
41365
+ "loss": 1.3991,
41366
+ "step": 5908
41367
+ },
41368
+ {
41369
+ "epoch": 4.855382087099425,
41370
+ "grad_norm": 0.5755516290664673,
41371
+ "learning_rate": 0.0005,
41372
+ "loss": 1.4464,
41373
+ "step": 5909
41374
+ },
41375
+ {
41376
+ "epoch": 4.85620377978636,
41377
+ "grad_norm": 0.5279106497764587,
41378
+ "learning_rate": 0.0005,
41379
+ "loss": 1.3729,
41380
+ "step": 5910
41381
+ },
41382
+ {
41383
+ "epoch": 4.857025472473295,
41384
+ "grad_norm": 0.5568866729736328,
41385
+ "learning_rate": 0.0005,
41386
+ "loss": 1.3846,
41387
+ "step": 5911
41388
+ },
41389
+ {
41390
+ "epoch": 4.85784716516023,
41391
+ "grad_norm": 0.5258705615997314,
41392
+ "learning_rate": 0.0005,
41393
+ "loss": 1.3301,
41394
+ "step": 5912
41395
+ },
41396
+ {
41397
+ "epoch": 4.8586688578471655,
41398
+ "grad_norm": 0.5256040692329407,
41399
+ "learning_rate": 0.0005,
41400
+ "loss": 1.3543,
41401
+ "step": 5913
41402
+ },
41403
+ {
41404
+ "epoch": 4.8594905505341,
41405
+ "grad_norm": 0.5508636236190796,
41406
+ "learning_rate": 0.0005,
41407
+ "loss": 1.3168,
41408
+ "step": 5914
41409
+ },
41410
+ {
41411
+ "epoch": 4.860312243221035,
41412
+ "grad_norm": 0.5376938581466675,
41413
+ "learning_rate": 0.0005,
41414
+ "loss": 1.3618,
41415
+ "step": 5915
41416
+ },
41417
+ {
41418
+ "epoch": 4.86113393590797,
41419
+ "grad_norm": 0.513633668422699,
41420
+ "learning_rate": 0.0005,
41421
+ "loss": 1.279,
41422
+ "step": 5916
41423
+ },
41424
+ {
41425
+ "epoch": 4.861955628594906,
41426
+ "grad_norm": 0.5526177287101746,
41427
+ "learning_rate": 0.0005,
41428
+ "loss": 1.2778,
41429
+ "step": 5917
41430
+ },
41431
+ {
41432
+ "epoch": 4.862777321281841,
41433
+ "grad_norm": 0.5580543279647827,
41434
+ "learning_rate": 0.0005,
41435
+ "loss": 1.3931,
41436
+ "step": 5918
41437
+ },
41438
+ {
41439
+ "epoch": 4.863599013968775,
41440
+ "grad_norm": 0.5640444159507751,
41441
+ "learning_rate": 0.0005,
41442
+ "loss": 1.4172,
41443
+ "step": 5919
41444
+ },
41445
+ {
41446
+ "epoch": 4.864420706655711,
41447
+ "grad_norm": 0.5366266369819641,
41448
+ "learning_rate": 0.0005,
41449
+ "loss": 1.2472,
41450
+ "step": 5920
41451
+ },
41452
+ {
41453
+ "epoch": 4.8652423993426455,
41454
+ "grad_norm": 0.5291760563850403,
41455
+ "learning_rate": 0.0005,
41456
+ "loss": 1.2402,
41457
+ "step": 5921
41458
+ },
41459
+ {
41460
+ "epoch": 4.866064092029581,
41461
+ "grad_norm": 0.5365450978279114,
41462
+ "learning_rate": 0.0005,
41463
+ "loss": 1.2807,
41464
+ "step": 5922
41465
+ },
41466
+ {
41467
+ "epoch": 4.866885784716516,
41468
+ "grad_norm": 0.5284454822540283,
41469
+ "learning_rate": 0.0005,
41470
+ "loss": 1.4549,
41471
+ "step": 5923
41472
+ },
41473
+ {
41474
+ "epoch": 4.867707477403451,
41475
+ "grad_norm": 0.541029155254364,
41476
+ "learning_rate": 0.0005,
41477
+ "loss": 1.3154,
41478
+ "step": 5924
41479
+ },
41480
+ {
41481
+ "epoch": 4.868529170090386,
41482
+ "grad_norm": 0.5600323677062988,
41483
+ "learning_rate": 0.0005,
41484
+ "loss": 1.3268,
41485
+ "step": 5925
41486
+ },
41487
+ {
41488
+ "epoch": 4.869350862777321,
41489
+ "grad_norm": 0.5529478788375854,
41490
+ "learning_rate": 0.0005,
41491
+ "loss": 1.3629,
41492
+ "step": 5926
41493
+ },
41494
+ {
41495
+ "epoch": 4.870172555464256,
41496
+ "grad_norm": 0.5471394658088684,
41497
+ "learning_rate": 0.0005,
41498
+ "loss": 1.2708,
41499
+ "step": 5927
41500
+ },
41501
+ {
41502
+ "epoch": 4.870994248151192,
41503
+ "grad_norm": 0.5359401702880859,
41504
+ "learning_rate": 0.0005,
41505
+ "loss": 1.3363,
41506
+ "step": 5928
41507
+ },
41508
+ {
41509
+ "epoch": 4.871815940838126,
41510
+ "grad_norm": 0.5148870348930359,
41511
+ "learning_rate": 0.0005,
41512
+ "loss": 1.2534,
41513
+ "step": 5929
41514
+ },
41515
+ {
41516
+ "epoch": 4.872637633525062,
41517
+ "grad_norm": 0.5293238759040833,
41518
+ "learning_rate": 0.0005,
41519
+ "loss": 1.3152,
41520
+ "step": 5930
41521
+ },
41522
+ {
41523
+ "epoch": 4.8734593262119965,
41524
+ "grad_norm": 0.5174310207366943,
41525
+ "learning_rate": 0.0005,
41526
+ "loss": 1.3534,
41527
+ "step": 5931
41528
+ },
41529
+ {
41530
+ "epoch": 4.874281018898932,
41531
+ "grad_norm": 0.5341978073120117,
41532
+ "learning_rate": 0.0005,
41533
+ "loss": 1.3888,
41534
+ "step": 5932
41535
+ },
41536
+ {
41537
+ "epoch": 4.875102711585867,
41538
+ "grad_norm": 0.5729701519012451,
41539
+ "learning_rate": 0.0005,
41540
+ "loss": 1.4173,
41541
+ "step": 5933
41542
+ },
41543
+ {
41544
+ "epoch": 4.875924404272802,
41545
+ "grad_norm": 0.6612470746040344,
41546
+ "learning_rate": 0.0005,
41547
+ "loss": 1.3927,
41548
+ "step": 5934
41549
+ },
41550
+ {
41551
+ "epoch": 4.876746096959737,
41552
+ "grad_norm": 0.5212023258209229,
41553
+ "learning_rate": 0.0005,
41554
+ "loss": 1.3259,
41555
+ "step": 5935
41556
+ },
41557
+ {
41558
+ "epoch": 4.8775677896466725,
41559
+ "grad_norm": 0.5570096969604492,
41560
+ "learning_rate": 0.0005,
41561
+ "loss": 1.2884,
41562
+ "step": 5936
41563
+ },
41564
+ {
41565
+ "epoch": 4.878389482333607,
41566
+ "grad_norm": 0.5613424181938171,
41567
+ "learning_rate": 0.0005,
41568
+ "loss": 1.2463,
41569
+ "step": 5937
41570
+ },
41571
+ {
41572
+ "epoch": 4.879211175020543,
41573
+ "grad_norm": 0.5613168478012085,
41574
+ "learning_rate": 0.0005,
41575
+ "loss": 1.3115,
41576
+ "step": 5938
41577
+ },
41578
+ {
41579
+ "epoch": 4.880032867707477,
41580
+ "grad_norm": 0.535275936126709,
41581
+ "learning_rate": 0.0005,
41582
+ "loss": 1.3002,
41583
+ "step": 5939
41584
+ },
41585
+ {
41586
+ "epoch": 4.880854560394413,
41587
+ "grad_norm": 0.5672900676727295,
41588
+ "learning_rate": 0.0005,
41589
+ "loss": 1.3669,
41590
+ "step": 5940
41591
+ },
41592
+ {
41593
+ "epoch": 4.881676253081348,
41594
+ "grad_norm": 0.5469388961791992,
41595
+ "learning_rate": 0.0005,
41596
+ "loss": 1.3652,
41597
+ "step": 5941
41598
+ },
41599
+ {
41600
+ "epoch": 4.882497945768282,
41601
+ "grad_norm": 0.529625415802002,
41602
+ "learning_rate": 0.0005,
41603
+ "loss": 1.4422,
41604
+ "step": 5942
41605
+ },
41606
+ {
41607
+ "epoch": 4.883319638455218,
41608
+ "grad_norm": 0.5889802575111389,
41609
+ "learning_rate": 0.0005,
41610
+ "loss": 1.3788,
41611
+ "step": 5943
41612
+ },
41613
+ {
41614
+ "epoch": 4.884141331142153,
41615
+ "grad_norm": 0.5382450819015503,
41616
+ "learning_rate": 0.0005,
41617
+ "loss": 1.3318,
41618
+ "step": 5944
41619
+ },
41620
+ {
41621
+ "epoch": 4.884963023829088,
41622
+ "grad_norm": 0.5242584347724915,
41623
+ "learning_rate": 0.0005,
41624
+ "loss": 1.268,
41625
+ "step": 5945
41626
+ },
41627
+ {
41628
+ "epoch": 4.885784716516023,
41629
+ "grad_norm": 0.5483070611953735,
41630
+ "learning_rate": 0.0005,
41631
+ "loss": 1.4016,
41632
+ "step": 5946
41633
+ },
41634
+ {
41635
+ "epoch": 4.886606409202958,
41636
+ "grad_norm": 0.5779204368591309,
41637
+ "learning_rate": 0.0005,
41638
+ "loss": 1.327,
41639
+ "step": 5947
41640
+ },
41641
+ {
41642
+ "epoch": 4.887428101889893,
41643
+ "grad_norm": 0.5551186800003052,
41644
+ "learning_rate": 0.0005,
41645
+ "loss": 1.4228,
41646
+ "step": 5948
41647
+ },
41648
+ {
41649
+ "epoch": 4.888249794576828,
41650
+ "grad_norm": 0.5995270609855652,
41651
+ "learning_rate": 0.0005,
41652
+ "loss": 1.3831,
41653
+ "step": 5949
41654
+ },
41655
+ {
41656
+ "epoch": 4.889071487263763,
41657
+ "grad_norm": 0.5424328446388245,
41658
+ "learning_rate": 0.0005,
41659
+ "loss": 1.3815,
41660
+ "step": 5950
41661
+ },
41662
+ {
41663
+ "epoch": 4.889893179950699,
41664
+ "grad_norm": 0.5349864959716797,
41665
+ "learning_rate": 0.0005,
41666
+ "loss": 1.2749,
41667
+ "step": 5951
41668
+ },
41669
+ {
41670
+ "epoch": 4.890714872637633,
41671
+ "grad_norm": 0.5398481488227844,
41672
+ "learning_rate": 0.0005,
41673
+ "loss": 1.3635,
41674
+ "step": 5952
41675
+ },
41676
+ {
41677
+ "epoch": 4.891536565324569,
41678
+ "grad_norm": 0.5872131586074829,
41679
+ "learning_rate": 0.0005,
41680
+ "loss": 1.3778,
41681
+ "step": 5953
41682
+ },
41683
+ {
41684
+ "epoch": 4.8923582580115035,
41685
+ "grad_norm": 0.5435046553611755,
41686
+ "learning_rate": 0.0005,
41687
+ "loss": 1.3834,
41688
+ "step": 5954
41689
+ },
41690
+ {
41691
+ "epoch": 4.893179950698439,
41692
+ "grad_norm": 0.551789402961731,
41693
+ "learning_rate": 0.0005,
41694
+ "loss": 1.3315,
41695
+ "step": 5955
41696
+ },
41697
+ {
41698
+ "epoch": 4.894001643385374,
41699
+ "grad_norm": 0.5277841687202454,
41700
+ "learning_rate": 0.0005,
41701
+ "loss": 1.4072,
41702
+ "step": 5956
41703
+ },
41704
+ {
41705
+ "epoch": 4.894823336072309,
41706
+ "grad_norm": 0.5183998346328735,
41707
+ "learning_rate": 0.0005,
41708
+ "loss": 1.2465,
41709
+ "step": 5957
41710
+ },
41711
+ {
41712
+ "epoch": 4.895645028759244,
41713
+ "grad_norm": 0.561173677444458,
41714
+ "learning_rate": 0.0005,
41715
+ "loss": 1.4095,
41716
+ "step": 5958
41717
+ },
41718
+ {
41719
+ "epoch": 4.8964667214461794,
41720
+ "grad_norm": 0.5499217510223389,
41721
+ "learning_rate": 0.0005,
41722
+ "loss": 1.312,
41723
+ "step": 5959
41724
+ },
41725
+ {
41726
+ "epoch": 4.897288414133114,
41727
+ "grad_norm": 0.534344494342804,
41728
+ "learning_rate": 0.0005,
41729
+ "loss": 1.326,
41730
+ "step": 5960
41731
+ },
41732
+ {
41733
+ "epoch": 4.89811010682005,
41734
+ "grad_norm": 0.5524152517318726,
41735
+ "learning_rate": 0.0005,
41736
+ "loss": 1.3709,
41737
+ "step": 5961
41738
+ },
41739
+ {
41740
+ "epoch": 4.898931799506984,
41741
+ "grad_norm": 0.5235154032707214,
41742
+ "learning_rate": 0.0005,
41743
+ "loss": 1.212,
41744
+ "step": 5962
41745
+ },
41746
+ {
41747
+ "epoch": 4.89975349219392,
41748
+ "grad_norm": 0.5188632607460022,
41749
+ "learning_rate": 0.0005,
41750
+ "loss": 1.2143,
41751
+ "step": 5963
41752
+ },
41753
+ {
41754
+ "epoch": 4.9005751848808545,
41755
+ "grad_norm": 0.5360555648803711,
41756
+ "learning_rate": 0.0005,
41757
+ "loss": 1.4402,
41758
+ "step": 5964
41759
+ },
41760
+ {
41761
+ "epoch": 4.901396877567789,
41762
+ "grad_norm": 0.5215834379196167,
41763
+ "learning_rate": 0.0005,
41764
+ "loss": 1.2509,
41765
+ "step": 5965
41766
+ },
41767
+ {
41768
+ "epoch": 4.902218570254725,
41769
+ "grad_norm": 0.526918888092041,
41770
+ "learning_rate": 0.0005,
41771
+ "loss": 1.2802,
41772
+ "step": 5966
41773
+ },
41774
+ {
41775
+ "epoch": 4.90304026294166,
41776
+ "grad_norm": 0.5387428402900696,
41777
+ "learning_rate": 0.0005,
41778
+ "loss": 1.294,
41779
+ "step": 5967
41780
+ },
41781
+ {
41782
+ "epoch": 4.903861955628595,
41783
+ "grad_norm": 0.5555245280265808,
41784
+ "learning_rate": 0.0005,
41785
+ "loss": 1.4213,
41786
+ "step": 5968
41787
+ },
41788
+ {
41789
+ "epoch": 4.90468364831553,
41790
+ "grad_norm": 0.5404963493347168,
41791
+ "learning_rate": 0.0005,
41792
+ "loss": 1.227,
41793
+ "step": 5969
41794
+ },
41795
+ {
41796
+ "epoch": 4.905505341002465,
41797
+ "grad_norm": 0.5623646974563599,
41798
+ "learning_rate": 0.0005,
41799
+ "loss": 1.4319,
41800
+ "step": 5970
41801
+ },
41802
+ {
41803
+ "epoch": 4.9063270336894,
41804
+ "grad_norm": 0.5389553904533386,
41805
+ "learning_rate": 0.0005,
41806
+ "loss": 1.297,
41807
+ "step": 5971
41808
+ },
41809
+ {
41810
+ "epoch": 4.907148726376335,
41811
+ "grad_norm": 0.5477744936943054,
41812
+ "learning_rate": 0.0005,
41813
+ "loss": 1.3659,
41814
+ "step": 5972
41815
+ },
41816
+ {
41817
+ "epoch": 4.90797041906327,
41818
+ "grad_norm": 0.5301917791366577,
41819
+ "learning_rate": 0.0005,
41820
+ "loss": 1.2658,
41821
+ "step": 5973
41822
+ },
41823
+ {
41824
+ "epoch": 4.908792111750206,
41825
+ "grad_norm": 0.5291617512702942,
41826
+ "learning_rate": 0.0005,
41827
+ "loss": 1.3592,
41828
+ "step": 5974
41829
+ },
41830
+ {
41831
+ "epoch": 4.90961380443714,
41832
+ "grad_norm": 0.5419930815696716,
41833
+ "learning_rate": 0.0005,
41834
+ "loss": 1.3125,
41835
+ "step": 5975
41836
+ },
41837
+ {
41838
+ "epoch": 4.910435497124076,
41839
+ "grad_norm": 0.5221468210220337,
41840
+ "learning_rate": 0.0005,
41841
+ "loss": 1.3643,
41842
+ "step": 5976
41843
+ },
41844
+ {
41845
+ "epoch": 4.9112571898110104,
41846
+ "grad_norm": 0.5470991134643555,
41847
+ "learning_rate": 0.0005,
41848
+ "loss": 1.3461,
41849
+ "step": 5977
41850
+ },
41851
+ {
41852
+ "epoch": 4.912078882497946,
41853
+ "grad_norm": 0.5354620814323425,
41854
+ "learning_rate": 0.0005,
41855
+ "loss": 1.2681,
41856
+ "step": 5978
41857
+ },
41858
+ {
41859
+ "epoch": 4.912900575184881,
41860
+ "grad_norm": 0.5851417779922485,
41861
+ "learning_rate": 0.0005,
41862
+ "loss": 1.3237,
41863
+ "step": 5979
41864
+ },
41865
+ {
41866
+ "epoch": 4.913722267871816,
41867
+ "grad_norm": 0.5492926239967346,
41868
+ "learning_rate": 0.0005,
41869
+ "loss": 1.417,
41870
+ "step": 5980
41871
+ },
41872
+ {
41873
+ "epoch": 4.914543960558751,
41874
+ "grad_norm": 0.5446394681930542,
41875
+ "learning_rate": 0.0005,
41876
+ "loss": 1.275,
41877
+ "step": 5981
41878
+ },
41879
+ {
41880
+ "epoch": 4.915365653245686,
41881
+ "grad_norm": 0.5484851002693176,
41882
+ "learning_rate": 0.0005,
41883
+ "loss": 1.3842,
41884
+ "step": 5982
41885
+ },
41886
+ {
41887
+ "epoch": 4.916187345932621,
41888
+ "grad_norm": 0.5432127714157104,
41889
+ "learning_rate": 0.0005,
41890
+ "loss": 1.4247,
41891
+ "step": 5983
41892
+ },
41893
+ {
41894
+ "epoch": 4.917009038619557,
41895
+ "grad_norm": 0.5324352979660034,
41896
+ "learning_rate": 0.0005,
41897
+ "loss": 1.3269,
41898
+ "step": 5984
41899
+ },
41900
+ {
41901
+ "epoch": 4.917830731306491,
41902
+ "grad_norm": 0.5508584976196289,
41903
+ "learning_rate": 0.0005,
41904
+ "loss": 1.1933,
41905
+ "step": 5985
41906
+ },
41907
+ {
41908
+ "epoch": 4.918652423993427,
41909
+ "grad_norm": 0.5484975576400757,
41910
+ "learning_rate": 0.0005,
41911
+ "loss": 1.2294,
41912
+ "step": 5986
41913
+ },
41914
+ {
41915
+ "epoch": 4.9194741166803615,
41916
+ "grad_norm": 0.5371730923652649,
41917
+ "learning_rate": 0.0005,
41918
+ "loss": 1.293,
41919
+ "step": 5987
41920
+ },
41921
+ {
41922
+ "epoch": 4.920295809367296,
41923
+ "grad_norm": 0.5321599841117859,
41924
+ "learning_rate": 0.0005,
41925
+ "loss": 1.3553,
41926
+ "step": 5988
41927
+ },
41928
+ {
41929
+ "epoch": 4.921117502054232,
41930
+ "grad_norm": 0.5347539186477661,
41931
+ "learning_rate": 0.0005,
41932
+ "loss": 1.3904,
41933
+ "step": 5989
41934
+ },
41935
+ {
41936
+ "epoch": 4.921939194741167,
41937
+ "grad_norm": 0.5540315508842468,
41938
+ "learning_rate": 0.0005,
41939
+ "loss": 1.3358,
41940
+ "step": 5990
41941
+ },
41942
+ {
41943
+ "epoch": 4.922760887428102,
41944
+ "grad_norm": 0.543171226978302,
41945
+ "learning_rate": 0.0005,
41946
+ "loss": 1.3914,
41947
+ "step": 5991
41948
+ },
41949
+ {
41950
+ "epoch": 4.923582580115037,
41951
+ "grad_norm": 0.5221793055534363,
41952
+ "learning_rate": 0.0005,
41953
+ "loss": 1.3852,
41954
+ "step": 5992
41955
+ },
41956
+ {
41957
+ "epoch": 4.924404272801972,
41958
+ "grad_norm": 0.5648449659347534,
41959
+ "learning_rate": 0.0005,
41960
+ "loss": 1.3695,
41961
+ "step": 5993
41962
+ },
41963
+ {
41964
+ "epoch": 4.925225965488907,
41965
+ "grad_norm": 0.5360020995140076,
41966
+ "learning_rate": 0.0005,
41967
+ "loss": 1.296,
41968
+ "step": 5994
41969
+ },
41970
+ {
41971
+ "epoch": 4.926047658175842,
41972
+ "grad_norm": 0.5223022699356079,
41973
+ "learning_rate": 0.0005,
41974
+ "loss": 1.2654,
41975
+ "step": 5995
41976
+ },
41977
+ {
41978
+ "epoch": 4.926869350862777,
41979
+ "grad_norm": 0.5244916677474976,
41980
+ "learning_rate": 0.0005,
41981
+ "loss": 1.2287,
41982
+ "step": 5996
41983
+ },
41984
+ {
41985
+ "epoch": 4.9276910435497125,
41986
+ "grad_norm": 0.5709188580513,
41987
+ "learning_rate": 0.0005,
41988
+ "loss": 1.414,
41989
+ "step": 5997
41990
+ },
41991
+ {
41992
+ "epoch": 4.928512736236647,
41993
+ "grad_norm": 0.5473321080207825,
41994
+ "learning_rate": 0.0005,
41995
+ "loss": 1.3689,
41996
+ "step": 5998
41997
+ },
41998
+ {
41999
+ "epoch": 4.929334428923583,
42000
+ "grad_norm": 0.5459017157554626,
42001
+ "learning_rate": 0.0005,
42002
+ "loss": 1.2876,
42003
+ "step": 5999
42004
+ },
42005
+ {
42006
+ "epoch": 4.930156121610517,
42007
+ "grad_norm": 0.5736708641052246,
42008
+ "learning_rate": 0.0005,
42009
+ "loss": 1.3623,
42010
+ "step": 6000
42011
  }
42012
  ],
42013
  "logging_steps": 1,
 
42022
  "should_evaluate": false,
42023
  "should_log": false,
42024
  "should_save": true,
42025
+ "should_training_stop": true
42026
  },
42027
  "attributes": {}
42028
  }
42029
  },
42030
+ "total_flos": 1.2481056037404672e+18,
42031
  "train_batch_size": 8,
42032
  "trial_name": null,
42033
  "trial_params": null