souging committed
Commit 9f80601 · verified · 1 Parent(s): 15a9293

Training in progress, epoch 4, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:904e16deb513a35038d85d18acbbe70eb2a9a88a949d4e242191093354bc7ee9
+ oid sha256:a29ac7ba6f53f92462f92ab317dac22f616978288aa0a0bf5eb7b73a93e8623b
  size 97307544
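
The diff above touches only a Git LFS pointer: `oid` is the SHA-256 of the actual `adapter_model.safetensors` blob and `size` its byte count, so the adapter weights changed while keeping the same size. A minimal sketch (file paths are assumptions) for verifying a downloaded blob against such a pointer:

```python
import hashlib
import os

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Compare a downloaded blob against the oid/size fields of a Git LFS pointer file."""
    fields = {}
    with open(pointer_path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            if value:
                fields[key] = value
    expected_oid = fields["oid"].removeprefix("sha256:")
    expected_size = int(fields["size"])

    sha = hashlib.sha256()
    with open(blob_path, "rb") as blob:
        for chunk in iter(lambda: blob.read(1 << 20), b""):
            sha.update(chunk)
    return sha.hexdigest() == expected_oid and os.path.getsize(blob_path) == expected_size

# Hypothetical paths: the pointer as stored in git vs. the LFS-resolved file on disk.
print(verify_lfs_pointer("adapter_model.safetensors.pointer",
                         "last-checkpoint/adapter_model.safetensors"))
```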
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b496d457f68a9e27cc3b0313db1c46c1b597fc4997bd077b30a5ba470ce47670
+ oid sha256:87a7f4d414d1b3ef9f1439d2a5c336e25a673350c1f7c1a36bae8691277656ef
  size 49846644
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e1863d6014d5991fa8a2a5dcd1a9a6b77779d54615a994dfbb39072ac687e102
+ oid sha256:b34c69f6d6c32635ff50b0a1faed18792675796ed61b4917357f7fa723205e2f
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:03acca099ba87451bc25add0f213c9c00bdab561a859daba65132f8169778e51
+ oid sha256:f00b65fe360af6cac79aca5d512b7d43a8aea8109a2419986cb6ec4493d90572
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bd7a2301e1ee44f778f53965af9ae25b988a41de89c7ca647572a5de0e8c6035
+ oid sha256:3338a4e0a24655344a2e4fe71d8047a4351bfcfb4de0042932ed9ef74e6a9a04
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7842500365640c8726ea61934b43e988650a0b36c7a4e82c0ea9f0d0018c4f85
+ oid sha256:a28ffcb98b6e03917e67aa0d155cd2bcb5bba7adff5ab9f65b2a312750f07526
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a8d0a0263d1fd034f3144d5c66fc4c0a4b6be58288bb61fa17e67b12787d7875
+ oid sha256:1f34acd8813d6b9c98be80489c6f644fc7604245d4167815c19292650ca21464
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46820b55d370e4b0e9905bb63c8f603d94f0d276d56614f9a9ee3a867d480adc
+ oid sha256:359fcea393c9e357ac9329090b6808db28e869fcad41f7d4e9d5f4ea556aac70
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:60c418dcc06796fbd335ec9646dd36aa0f8291ec906e908fd31e851ed5f719f3
+ oid sha256:d9ec9b30c1aa25c7c28c666461ba2a0d9c5781d925d9a3f624531498737156b1
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:211c5719ddd067d69163fe8181fd284d3e39fcb6e07d40465148b61574e89c60
+ oid sha256:1b73a0419b8ce412c9ab0e7b0fff0b7e12dd9a30c2645954ee9b29ede96197d8
  size 15984
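
rng_state_0.pth through rng_state_7.pth are refreshed together, presumably one RNG snapshot per data-parallel rank in an 8-process run, so data shuffling and dropout resume deterministically. A hedged sketch of what saving and restoring such per-rank state can look like (the exact dict layout the HF Trainer writes into these files is an assumption here):

```python
# Illustrative sketch of per-rank RNG checkpointing; not necessarily the exact
# layout used for rng_state_{rank}.pth in this repository.
import random
import numpy as np
import torch

def save_rng_state(path: str) -> None:
    state = {
        "python": random.getstate(),
        "numpy": np.random.get_state(),
        "cpu": torch.get_rng_state(),
    }
    if torch.cuda.is_available():
        state["cuda"] = torch.cuda.get_rng_state_all()
    torch.save(state, path)

def load_rng_state(path: str) -> None:
    state = torch.load(path)
    random.setstate(state["python"])
    np.random.set_state(state["numpy"])
    torch.set_rng_state(state["cpu"])
    if torch.cuda.is_available() and "cuda" in state:
        torch.cuda.set_rng_state_all(state["cuda"])

# Usage (rank would come from e.g. torch.distributed.get_rank() in a DDP job):
#   save_rng_state(f"last-checkpoint/rng_state_{rank}.pth")
#   load_rng_state(f"last-checkpoint/rng_state_{rank}.pth")
```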
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a1524da22a5dd2627500416ed0839a8d36e930bc3603327e51a5aed9de4b093b
+ oid sha256:f3b5e41a3049ed9c6f775655d186804bf7846bdec4fbbd638a092baab2602b8a
  size 1064
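
optimizer.pt and scheduler.pt carry the optimizer and LR-scheduler `state_dict()` snapshots needed to continue the run. A quick, hedged way to inspect them locally (paths assume the checkpoint has been downloaded; the key names shown are typical for torch optimizers and schedulers, not guaranteed for this exact run):

```python
import torch

# Load on CPU; these are plain state_dict() snapshots written with torch.save.
opt_state = torch.load("last-checkpoint/optimizer.pt", map_location="cpu")
if isinstance(opt_state, dict):
    print(list(opt_state.keys()))                 # typically ["state", "param_groups"]
    for group in opt_state.get("param_groups", []):
        print("lr:", group.get("lr"), "weight_decay:", group.get("weight_decay"))

sched_state = torch.load("last-checkpoint/scheduler.pt", map_location="cpu")
print(sched_state)  # e.g. last_epoch / _last_lr for LambdaLR-style schedulers
```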
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 4.0,
+ "epoch": 4.99812382739212,
  "eval_steps": 500,
- "global_step": 533,
+ "global_step": 666,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -3738,6 +3738,937 @@
  "learning_rate": 3.621358721151505e-05,
  "loss": 1.8768,
  "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 4.00750469043152,
3744
+ "grad_norm": 0.23720650374889374,
3745
+ "learning_rate": 3.5808773539880973e-05,
3746
+ "loss": 1.0723,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 4.01500938086304,
3751
+ "grad_norm": 0.24943062663078308,
3752
+ "learning_rate": 3.540574711865146e-05,
3753
+ "loss": 0.9952,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 4.022514071294559,
3758
+ "grad_norm": 0.25107982754707336,
3759
+ "learning_rate": 3.500451899699935e-05,
3760
+ "loss": 1.0325,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 4.030018761726079,
3765
+ "grad_norm": 0.2480771690607071,
3766
+ "learning_rate": 3.460510017479631e-05,
3767
+ "loss": 1.0892,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 4.037523452157599,
3772
+ "grad_norm": 0.2446199208498001,
3773
+ "learning_rate": 3.420750160231118e-05,
3774
+ "loss": 1.1082,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 4.045028142589119,
3779
+ "grad_norm": 0.2848975956439972,
3780
+ "learning_rate": 3.381173417990957e-05,
3781
+ "loss": 1.0322,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 4.052532833020638,
3786
+ "grad_norm": 0.2551771104335785,
3787
+ "learning_rate": 3.3417808757755355e-05,
3788
+ "loss": 1.029,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 4.0600375234521575,
3793
+ "grad_norm": 0.2457280158996582,
3794
+ "learning_rate": 3.302573613551292e-05,
3795
+ "loss": 1.0687,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 4.067542213883677,
3800
+ "grad_norm": 0.23483915627002716,
3801
+ "learning_rate": 3.263552706205128e-05,
3802
+ "loss": 1.0712,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 4.075046904315197,
3807
+ "grad_norm": 0.2625795602798462,
3808
+ "learning_rate": 3.22471922351493e-05,
3809
+ "loss": 1.0424,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 4.082551594746716,
3814
+ "grad_norm": 0.25196078419685364,
3815
+ "learning_rate": 3.186074230120244e-05,
3816
+ "loss": 0.9866,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 4.090056285178236,
3821
+ "grad_norm": 0.26508569717407227,
3822
+ "learning_rate": 3.147618785493083e-05,
3823
+ "loss": 1.0696,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 4.097560975609756,
3828
+ "grad_norm": 0.26318731904029846,
3829
+ "learning_rate": 3.109353943908893e-05,
3830
+ "loss": 1.1073,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 4.105065666041276,
3835
+ "grad_norm": 0.2586771249771118,
3836
+ "learning_rate": 3.071280754417626e-05,
3837
+ "loss": 1.0968,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 4.112570356472795,
3842
+ "grad_norm": 0.26417478919029236,
3843
+ "learning_rate": 3.033400260815008e-05,
3844
+ "loss": 1.0892,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 4.120075046904315,
3849
+ "grad_norm": 0.25766685605049133,
3850
+ "learning_rate": 2.9957135016139122e-05,
3851
+ "loss": 0.9827,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 4.127579737335835,
3856
+ "grad_norm": 0.2724906802177429,
3857
+ "learning_rate": 2.9582215100158706e-05,
3858
+ "loss": 1.1577,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 4.135084427767355,
3863
+ "grad_norm": 0.24692752957344055,
3864
+ "learning_rate": 2.920925313882776e-05,
3865
+ "loss": 1.1081,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 4.142589118198874,
3870
+ "grad_norm": 0.27086886763572693,
3871
+ "learning_rate": 2.8838259357086884e-05,
3872
+ "loss": 0.9835,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 4.150093808630394,
3877
+ "grad_norm": 0.25520050525665283,
3878
+ "learning_rate": 2.846924392591794e-05,
3879
+ "loss": 1.0608,
3880
+ "step": 553
3881
+ },
3882
+ {
3883
+ "epoch": 4.157598499061914,
3884
+ "grad_norm": 0.2637060582637787,
3885
+ "learning_rate": 2.8102216962065423e-05,
3886
+ "loss": 0.9214,
3887
+ "step": 554
3888
+ },
3889
+ {
3890
+ "epoch": 4.165103189493434,
3891
+ "grad_norm": 0.2559458613395691,
3892
+ "learning_rate": 2.7737188527758972e-05,
3893
+ "loss": 0.9785,
3894
+ "step": 555
3895
+ },
3896
+ {
3897
+ "epoch": 4.1726078799249535,
3898
+ "grad_norm": 0.2692560851573944,
3899
+ "learning_rate": 2.7374168630437456e-05,
3900
+ "loss": 1.1126,
3901
+ "step": 556
3902
+ },
3903
+ {
3904
+ "epoch": 4.1801125703564725,
3905
+ "grad_norm": 0.26712581515312195,
3906
+ "learning_rate": 2.7013167222474756e-05,
3907
+ "loss": 0.9964,
3908
+ "step": 557
3909
+ },
3910
+ {
3911
+ "epoch": 4.1876172607879925,
3912
+ "grad_norm": 0.269401878118515,
3913
+ "learning_rate": 2.6654194200906833e-05,
3914
+ "loss": 1.0647,
3915
+ "step": 558
3916
+ },
3917
+ {
3918
+ "epoch": 4.195121951219512,
3919
+ "grad_norm": 0.2693142294883728,
3920
+ "learning_rate": 2.629725940716041e-05,
3921
+ "loss": 1.0686,
3922
+ "step": 559
3923
+ },
3924
+ {
3925
+ "epoch": 4.202626641651032,
3926
+ "grad_norm": 0.23360580205917358,
3927
+ "learning_rate": 2.5942372626783172e-05,
3928
+ "loss": 0.9899,
3929
+ "step": 560
3930
+ },
3931
+ {
3932
+ "epoch": 4.210131332082551,
3933
+ "grad_norm": 0.27039438486099243,
3934
+ "learning_rate": 2.5589543589175485e-05,
3935
+ "loss": 1.1598,
3936
+ "step": 561
3937
+ },
3938
+ {
3939
+ "epoch": 4.217636022514071,
3940
+ "grad_norm": 0.2767198979854584,
3941
+ "learning_rate": 2.523878196732358e-05,
3942
+ "loss": 0.9577,
3943
+ "step": 562
3944
+ },
3945
+ {
3946
+ "epoch": 4.225140712945591,
3947
+ "grad_norm": 0.27451032400131226,
3948
+ "learning_rate": 2.489009737753459e-05,
3949
+ "loss": 1.2496,
3950
+ "step": 563
3951
+ },
3952
+ {
3953
+ "epoch": 4.232645403377111,
3954
+ "grad_norm": 0.24222443997859955,
3955
+ "learning_rate": 2.4543499379172615e-05,
3956
+ "loss": 1.018,
3957
+ "step": 564
3958
+ },
3959
+ {
3960
+ "epoch": 4.24015009380863,
3961
+ "grad_norm": 0.26674020290374756,
3962
+ "learning_rate": 2.4198997474396877e-05,
3963
+ "loss": 1.0977,
3964
+ "step": 565
3965
+ },
3966
+ {
3967
+ "epoch": 4.24765478424015,
3968
+ "grad_norm": 0.2519540786743164,
3969
+ "learning_rate": 2.3856601107901166e-05,
3970
+ "loss": 1.0182,
3971
+ "step": 566
3972
+ },
3973
+ {
3974
+ "epoch": 4.25515947467167,
3975
+ "grad_norm": 0.2805282771587372,
3976
+ "learning_rate": 2.351631966665476e-05,
3977
+ "loss": 0.9412,
3978
+ "step": 567
3979
+ },
3980
+ {
3981
+ "epoch": 4.26266416510319,
3982
+ "grad_norm": 0.25769346952438354,
3983
+ "learning_rate": 2.31781624796453e-05,
3984
+ "loss": 1.0087,
3985
+ "step": 568
3986
+ },
3987
+ {
3988
+ "epoch": 4.270168855534709,
3989
+ "grad_norm": 0.25440049171447754,
3990
+ "learning_rate": 2.2842138817622883e-05,
3991
+ "loss": 1.0862,
3992
+ "step": 569
3993
+ },
3994
+ {
3995
+ "epoch": 4.277673545966229,
3996
+ "grad_norm": 0.25586551427841187,
3997
+ "learning_rate": 2.250825789284594e-05,
3998
+ "loss": 1.0173,
3999
+ "step": 570
4000
+ },
4001
+ {
4002
+ "epoch": 4.285178236397749,
4003
+ "grad_norm": 0.2734562158584595,
4004
+ "learning_rate": 2.217652885882869e-05,
4005
+ "loss": 1.0565,
4006
+ "step": 571
4007
+ },
4008
+ {
4009
+ "epoch": 4.2926829268292686,
4010
+ "grad_norm": 0.2658538818359375,
4011
+ "learning_rate": 2.1846960810090188e-05,
4012
+ "loss": 1.0235,
4013
+ "step": 572
4014
+ },
4015
+ {
4016
+ "epoch": 4.300187617260788,
4017
+ "grad_norm": 0.2710430920124054,
4018
+ "learning_rate": 2.151956278190494e-05,
4019
+ "loss": 1.0054,
4020
+ "step": 573
4021
+ },
4022
+ {
4023
+ "epoch": 4.3076923076923075,
4024
+ "grad_norm": 0.2484230250120163,
4025
+ "learning_rate": 2.119434375005527e-05,
4026
+ "loss": 0.9814,
4027
+ "step": 574
4028
+ },
4029
+ {
4030
+ "epoch": 4.315196998123827,
4031
+ "grad_norm": 0.27289608120918274,
4032
+ "learning_rate": 2.087131263058526e-05,
4033
+ "loss": 1.0434,
4034
+ "step": 575
4035
+ },
4036
+ {
4037
+ "epoch": 4.322701688555347,
4038
+ "grad_norm": 0.25142380595207214,
4039
+ "learning_rate": 2.055047827955618e-05,
4040
+ "loss": 0.996,
4041
+ "step": 576
4042
+ },
4043
+ {
4044
+ "epoch": 4.330206378986867,
4045
+ "grad_norm": 0.281368225812912,
4046
+ "learning_rate": 2.0231849492803852e-05,
4047
+ "loss": 1.0575,
4048
+ "step": 577
4049
+ },
4050
+ {
4051
+ "epoch": 4.337711069418386,
4052
+ "grad_norm": 0.28785380721092224,
4053
+ "learning_rate": 1.991543500569745e-05,
4054
+ "loss": 1.0859,
4055
+ "step": 578
4056
+ },
4057
+ {
4058
+ "epoch": 4.345215759849906,
4059
+ "grad_norm": 0.28092002868652344,
4060
+ "learning_rate": 1.960124349289992e-05,
4061
+ "loss": 1.1349,
4062
+ "step": 579
4063
+ },
4064
+ {
4065
+ "epoch": 4.352720450281426,
4066
+ "grad_norm": 0.2787632644176483,
4067
+ "learning_rate": 1.928928356813032e-05,
4068
+ "loss": 1.0366,
4069
+ "step": 580
4070
+ },
4071
+ {
4072
+ "epoch": 4.360225140712946,
4073
+ "grad_norm": 0.2454954981803894,
4074
+ "learning_rate": 1.8979563783927565e-05,
4075
+ "loss": 0.9675,
4076
+ "step": 581
4077
+ },
4078
+ {
4079
+ "epoch": 4.367729831144465,
4080
+ "grad_norm": 0.2587539851665497,
4081
+ "learning_rate": 1.8672092631416013e-05,
4082
+ "loss": 1.0678,
4083
+ "step": 582
4084
+ },
4085
+ {
4086
+ "epoch": 4.375234521575985,
4087
+ "grad_norm": 0.2558777928352356,
4088
+ "learning_rate": 1.8366878540072614e-05,
4089
+ "loss": 0.9883,
4090
+ "step": 583
4091
+ },
4092
+ {
4093
+ "epoch": 4.382739212007505,
4094
+ "grad_norm": 0.2781766355037689,
4095
+ "learning_rate": 1.8063929877495892e-05,
4096
+ "loss": 0.9614,
4097
+ "step": 584
4098
+ },
4099
+ {
4100
+ "epoch": 4.390243902439025,
4101
+ "grad_norm": 0.3783913254737854,
4102
+ "learning_rate": 1.7763254949176414e-05,
4103
+ "loss": 1.0883,
4104
+ "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 4.397748592870544,
4108
+ "grad_norm": 0.25674161314964294,
4109
+ "learning_rate": 1.7464861998269243e-05,
4110
+ "loss": 1.03,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 4.405253283302064,
4115
+ "grad_norm": 0.2544950842857361,
4116
+ "learning_rate": 1.7168759205367893e-05,
4117
+ "loss": 1.0044,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 4.412757973733584,
4122
+ "grad_norm": 0.2635541558265686,
4123
+ "learning_rate": 1.6874954688279956e-05,
4124
+ "loss": 0.9767,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 4.4202626641651035,
4129
+ "grad_norm": 0.27090808749198914,
4130
+ "learning_rate": 1.6583456501804725e-05,
4131
+ "loss": 0.9826,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 4.4277673545966225,
4136
+ "grad_norm": 0.2597501277923584,
4137
+ "learning_rate": 1.6294272637512183e-05,
4138
+ "loss": 1.0187,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 4.435272045028142,
4143
+ "grad_norm": 0.28622573614120483,
4144
+ "learning_rate": 1.600741102352409e-05,
4145
+ "loss": 1.0623,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 4.442776735459662,
4150
+ "grad_norm": 0.25954049825668335,
4151
+ "learning_rate": 1.57228795242965e-05,
4152
+ "loss": 1.0268,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 4.450281425891182,
4157
+ "grad_norm": 0.2589193284511566,
4158
+ "learning_rate": 1.544068594040417e-05,
4159
+ "loss": 0.9672,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 4.457786116322701,
4164
+ "grad_norm": 0.2728925943374634,
4165
+ "learning_rate": 1.516083800832676e-05,
4166
+ "loss": 1.0571,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 4.465290806754221,
4171
+ "grad_norm": 0.2503071129322052,
4172
+ "learning_rate": 1.488334340023669e-05,
4173
+ "loss": 1.0172,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 4.472795497185741,
4178
+ "grad_norm": 0.2626873552799225,
4179
+ "learning_rate": 1.4608209723788835e-05,
4180
+ "loss": 1.036,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 4.480300187617261,
4185
+ "grad_norm": 0.3006345331668854,
4186
+ "learning_rate": 1.4335444521911899e-05,
4187
+ "loss": 1.1155,
4188
+ "step": 597
4189
+ },
4190
+ {
4191
+ "epoch": 4.487804878048781,
4192
+ "grad_norm": 0.2929830849170685,
4193
+ "learning_rate": 1.4065055272601703e-05,
4194
+ "loss": 1.0768,
4195
+ "step": 598
4196
+ },
4197
+ {
4198
+ "epoch": 4.4953095684803,
4199
+ "grad_norm": 0.25911423563957214,
4200
+ "learning_rate": 1.3797049388716065e-05,
4201
+ "loss": 1.0368,
4202
+ "step": 599
4203
+ },
4204
+ {
4205
+ "epoch": 4.50281425891182,
4206
+ "grad_norm": 0.2785639464855194,
4207
+ "learning_rate": 1.3531434217771692e-05,
4208
+ "loss": 1.0747,
4209
+ "step": 600
4210
+ },
4211
+ {
4212
+ "epoch": 4.51031894934334,
4213
+ "grad_norm": 0.27841538190841675,
4214
+ "learning_rate": 1.3268217041742701e-05,
4215
+ "loss": 1.0116,
4216
+ "step": 601
4217
+ },
4218
+ {
4219
+ "epoch": 4.517823639774859,
4220
+ "grad_norm": 0.27080854773521423,
4221
+ "learning_rate": 1.3007405076860875e-05,
4222
+ "loss": 0.9667,
4223
+ "step": 602
4224
+ },
4225
+ {
4226
+ "epoch": 4.525328330206379,
4227
+ "grad_norm": 0.25426366925239563,
4228
+ "learning_rate": 1.2749005473418015e-05,
4229
+ "loss": 0.9367,
4230
+ "step": 603
4231
+ },
4232
+ {
4233
+ "epoch": 4.532833020637899,
4234
+ "grad_norm": 0.24661357700824738,
4235
+ "learning_rate": 1.2493025315569801e-05,
4236
+ "loss": 0.9841,
4237
+ "step": 604
4238
+ },
4239
+ {
4240
+ "epoch": 4.5403377110694185,
4241
+ "grad_norm": 0.2595573365688324,
4242
+ "learning_rate": 1.2239471621141508e-05,
4243
+ "loss": 0.9494,
4244
+ "step": 605
4245
+ },
4246
+ {
4247
+ "epoch": 4.547842401500938,
4248
+ "grad_norm": 0.27605584263801575,
4249
+ "learning_rate": 1.1988351341435792e-05,
4250
+ "loss": 1.0487,
4251
+ "step": 606
4252
+ },
4253
+ {
4254
+ "epoch": 4.5553470919324575,
4255
+ "grad_norm": 0.2632475197315216,
4256
+ "learning_rate": 1.173967136104196e-05,
4257
+ "loss": 1.0948,
4258
+ "step": 607
4259
+ },
4260
+ {
4261
+ "epoch": 4.562851782363977,
4262
+ "grad_norm": 0.2705388367176056,
4263
+ "learning_rate": 1.1493438497647313e-05,
4264
+ "loss": 1.0667,
4265
+ "step": 608
4266
+ },
4267
+ {
4268
+ "epoch": 4.570356472795497,
4269
+ "grad_norm": 0.25751256942749023,
4270
+ "learning_rate": 1.1249659501850155e-05,
4271
+ "loss": 1.1783,
4272
+ "step": 609
4273
+ },
4274
+ {
4275
+ "epoch": 4.577861163227017,
4276
+ "grad_norm": 0.2475949227809906,
4277
+ "learning_rate": 1.1008341056974854e-05,
4278
+ "loss": 1.0667,
4279
+ "step": 610
4280
+ },
4281
+ {
4282
+ "epoch": 4.585365853658536,
4283
+ "grad_norm": 0.26045867800712585,
4284
+ "learning_rate": 1.0769489778888405e-05,
4285
+ "loss": 0.9714,
4286
+ "step": 611
4287
+ },
4288
+ {
4289
+ "epoch": 4.592870544090056,
4290
+ "grad_norm": 0.2527018189430237,
4291
+ "learning_rate": 1.0533112215819298e-05,
4292
+ "loss": 1.0692,
4293
+ "step": 612
4294
+ },
4295
+ {
4296
+ "epoch": 4.600375234521576,
4297
+ "grad_norm": 0.25061407685279846,
4298
+ "learning_rate": 1.029921484817783e-05,
4299
+ "loss": 0.9363,
4300
+ "step": 613
4301
+ },
4302
+ {
4303
+ "epoch": 4.607879924953096,
4304
+ "grad_norm": 0.24939140677452087,
4305
+ "learning_rate": 1.0067804088378455e-05,
4306
+ "loss": 1.038,
4307
+ "step": 614
4308
+ },
4309
+ {
4310
+ "epoch": 4.615384615384615,
4311
+ "grad_norm": 0.2591976523399353,
4312
+ "learning_rate": 9.8388862806641e-06,
4313
+ "loss": 0.9335,
4314
+ "step": 615
4315
+ },
4316
+ {
4317
+ "epoch": 4.622889305816135,
4318
+ "grad_norm": 0.26032814383506775,
4319
+ "learning_rate": 9.612467700932045e-06,
4320
+ "loss": 1.0562,
4321
+ "step": 616
4322
+ },
4323
+ {
4324
+ "epoch": 4.630393996247655,
4325
+ "grad_norm": 0.2716333270072937,
4326
+ "learning_rate": 9.388554556562049e-06,
4327
+ "loss": 0.9728,
4328
+ "step": 617
4329
+ },
4330
+ {
4331
+ "epoch": 4.637898686679175,
4332
+ "grad_norm": 0.26602107286453247,
4333
+ "learning_rate": 9.167152986246078e-06,
4334
+ "loss": 0.9198,
4335
+ "step": 618
4336
+ },
4337
+ {
4338
+ "epoch": 4.645403377110695,
4339
+ "grad_norm": 0.27277711033821106,
4340
+ "learning_rate": 8.948269059820025e-06,
4341
+ "loss": 1.0677,
4342
+ "step": 619
4343
+ },
4344
+ {
4345
+ "epoch": 4.652908067542214,
4346
+ "grad_norm": 0.2933201491832733,
4347
+ "learning_rate": 8.731908778097302e-06,
4348
+ "loss": 1.0856,
4349
+ "step": 620
4350
+ },
4351
+ {
4352
+ "epoch": 4.6604127579737336,
4353
+ "grad_norm": 0.26761552691459656,
4354
+ "learning_rate": 8.518078072704338e-06,
4355
+ "loss": 0.9855,
4356
+ "step": 621
4357
+ },
4358
+ {
4359
+ "epoch": 4.6679174484052535,
4360
+ "grad_norm": 0.26589658856391907,
4361
+ "learning_rate": 8.306782805917904e-06,
4362
+ "loss": 1.1391,
4363
+ "step": 622
4364
+ },
4365
+ {
4366
+ "epoch": 4.6754221388367725,
4367
+ "grad_norm": 0.245664581656456,
4368
+ "learning_rate": 8.098028770504494e-06,
4369
+ "loss": 1.0441,
4370
+ "step": 623
4371
+ },
4372
+ {
4373
+ "epoch": 4.682926829268292,
4374
+ "grad_norm": 0.25962963700294495,
4375
+ "learning_rate": 7.891821689561459e-06,
4376
+ "loss": 0.9938,
4377
+ "step": 624
4378
+ },
4379
+ {
4380
+ "epoch": 4.690431519699812,
4381
+ "grad_norm": 0.2487301230430603,
4382
+ "learning_rate": 7.68816721636004e-06,
4383
+ "loss": 1.0108,
4384
+ "step": 625
4385
+ },
4386
+ {
4387
+ "epoch": 4.697936210131332,
4388
+ "grad_norm": 0.2539227604866028,
4389
+ "learning_rate": 7.487070934190532e-06,
4390
+ "loss": 1.0537,
4391
+ "step": 626
4392
+ },
4393
+ {
4394
+ "epoch": 4.705440900562852,
4395
+ "grad_norm": 0.2824779152870178,
4396
+ "learning_rate": 7.288538356209092e-06,
4397
+ "loss": 1.0141,
4398
+ "step": 627
4399
+ },
4400
+ {
4401
+ "epoch": 4.712945590994371,
4402
+ "grad_norm": 0.24700817465782166,
4403
+ "learning_rate": 7.092574925286614e-06,
4404
+ "loss": 0.8957,
4405
+ "step": 628
4406
+ },
4407
+ {
4408
+ "epoch": 4.720450281425891,
4409
+ "grad_norm": 0.23364083468914032,
4410
+ "learning_rate": 6.899186013859561e-06,
4411
+ "loss": 1.0811,
4412
+ "step": 629
4413
+ },
4414
+ {
4415
+ "epoch": 4.727954971857411,
4416
+ "grad_norm": 0.25310617685317993,
4417
+ "learning_rate": 6.708376923782635e-06,
4418
+ "loss": 1.0577,
4419
+ "step": 630
4420
+ },
4421
+ {
4422
+ "epoch": 4.735459662288931,
4423
+ "grad_norm": 0.2497323900461197,
4424
+ "learning_rate": 6.520152886183406e-06,
4425
+ "loss": 0.9827,
4426
+ "step": 631
4427
+ },
4428
+ {
4429
+ "epoch": 4.74296435272045,
4430
+ "grad_norm": 0.27253395318984985,
4431
+ "learning_rate": 6.3345190613189635e-06,
4432
+ "loss": 0.963,
4433
+ "step": 632
4434
+ },
4435
+ {
4436
+ "epoch": 4.75046904315197,
4437
+ "grad_norm": 0.25061896443367004,
4438
+ "learning_rate": 6.151480538434382e-06,
4439
+ "loss": 1.1357,
4440
+ "step": 633
4441
+ },
4442
+ {
4443
+ "epoch": 4.75797373358349,
4444
+ "grad_norm": 0.25422045588493347,
4445
+ "learning_rate": 5.971042335623229e-06,
4446
+ "loss": 0.9789,
4447
+ "step": 634
4448
+ },
4449
+ {
4450
+ "epoch": 4.76547842401501,
4451
+ "grad_norm": 0.28153881430625916,
4452
+ "learning_rate": 5.793209399689978e-06,
4453
+ "loss": 1.0069,
4454
+ "step": 635
4455
+ },
4456
+ {
4457
+ "epoch": 4.772983114446529,
4458
+ "grad_norm": 0.26730191707611084,
4459
+ "learning_rate": 5.617986606014419e-06,
4460
+ "loss": 0.9413,
4461
+ "step": 636
4462
+ },
4463
+ {
4464
+ "epoch": 4.780487804878049,
4465
+ "grad_norm": 0.25754016637802124,
4466
+ "learning_rate": 5.445378758417925e-06,
4467
+ "loss": 0.967,
4468
+ "step": 637
4469
+ },
4470
+ {
4471
+ "epoch": 4.7879924953095685,
4472
+ "grad_norm": 0.26099568605422974,
4473
+ "learning_rate": 5.275390589031859e-06,
4474
+ "loss": 1.0104,
4475
+ "step": 638
4476
+ },
4477
+ {
4478
+ "epoch": 4.795497185741088,
4479
+ "grad_norm": 0.27538684010505676,
4480
+ "learning_rate": 5.108026758167719e-06,
4481
+ "loss": 1.0233,
4482
+ "step": 639
4483
+ },
4484
+ {
4485
+ "epoch": 4.803001876172608,
4486
+ "grad_norm": 0.26298317313194275,
4487
+ "learning_rate": 4.943291854189493e-06,
4488
+ "loss": 0.9756,
4489
+ "step": 640
4490
+ },
4491
+ {
4492
+ "epoch": 4.810506566604127,
4493
+ "grad_norm": 0.2920237183570862,
4494
+ "learning_rate": 4.781190393387796e-06,
4495
+ "loss": 0.985,
4496
+ "step": 641
4497
+ },
4498
+ {
4499
+ "epoch": 4.818011257035647,
4500
+ "grad_norm": 0.2614547908306122,
4501
+ "learning_rate": 4.6217268198560404e-06,
4502
+ "loss": 1.0421,
4503
+ "step": 642
4504
+ },
4505
+ {
4506
+ "epoch": 4.825515947467167,
4507
+ "grad_norm": 0.2709522843360901,
4508
+ "learning_rate": 4.464905505368658e-06,
4509
+ "loss": 1.0479,
4510
+ "step": 643
4511
+ },
4512
+ {
4513
+ "epoch": 4.833020637898686,
4514
+ "grad_norm": 0.2588852047920227,
4515
+ "learning_rate": 4.3107307492612086e-06,
4516
+ "loss": 0.8586,
4517
+ "step": 644
4518
+ },
4519
+ {
4520
+ "epoch": 4.840525328330206,
4521
+ "grad_norm": 0.2764037549495697,
4522
+ "learning_rate": 4.1592067783125015e-06,
4523
+ "loss": 1.0139,
4524
+ "step": 645
4525
+ },
4526
+ {
4527
+ "epoch": 4.848030018761726,
4528
+ "grad_norm": 0.2646092474460602,
4529
+ "learning_rate": 4.010337746628751e-06,
4530
+ "loss": 0.8966,
4531
+ "step": 646
4532
+ },
4533
+ {
4534
+ "epoch": 4.855534709193246,
4535
+ "grad_norm": 0.26426824927330017,
4536
+ "learning_rate": 3.864127735529656e-06,
4537
+ "loss": 1.0248,
4538
+ "step": 647
4539
+ },
4540
+ {
4541
+ "epoch": 4.863039399624766,
4542
+ "grad_norm": 0.2680165767669678,
4543
+ "learning_rate": 3.7205807534365315e-06,
4544
+ "loss": 1.002,
4545
+ "step": 648
4546
+ },
4547
+ {
4548
+ "epoch": 4.870544090056285,
4549
+ "grad_norm": 0.27381497621536255,
4550
+ "learning_rate": 3.5797007357623945e-06,
4551
+ "loss": 1.005,
4552
+ "step": 649
4553
+ },
4554
+ {
4555
+ "epoch": 4.878048780487805,
4556
+ "grad_norm": 0.24401965737342834,
4557
+ "learning_rate": 3.441491544804112e-06,
4558
+ "loss": 0.9504,
4559
+ "step": 650
4560
+ },
4561
+ {
4562
+ "epoch": 4.885553470919325,
4563
+ "grad_norm": 0.28096142411231995,
4564
+ "learning_rate": 3.3059569696364502e-06,
4565
+ "loss": 1.0629,
4566
+ "step": 651
4567
+ },
4568
+ {
4569
+ "epoch": 4.893058161350845,
4570
+ "grad_norm": 0.25856783986091614,
4571
+ "learning_rate": 3.1731007260082616e-06,
4572
+ "loss": 1.0072,
4573
+ "step": 652
4574
+ },
4575
+ {
4576
+ "epoch": 4.900562851782364,
4577
+ "grad_norm": 0.25341498851776123,
4578
+ "learning_rate": 3.0429264562405776e-06,
4579
+ "loss": 1.0113,
4580
+ "step": 653
4581
+ },
4582
+ {
4583
+ "epoch": 4.9080675422138835,
4584
+ "grad_norm": 0.2537418007850647,
4585
+ "learning_rate": 2.9154377291267674e-06,
4586
+ "loss": 1.0907,
4587
+ "step": 654
4588
+ },
4589
+ {
4590
+ "epoch": 4.915572232645403,
4591
+ "grad_norm": 0.252174973487854,
4592
+ "learning_rate": 2.790638039834668e-06,
4593
+ "loss": 0.9687,
4594
+ "step": 655
4595
+ },
4596
+ {
4597
+ "epoch": 4.923076923076923,
4598
+ "grad_norm": 0.26291623711586,
4599
+ "learning_rate": 2.6685308098108106e-06,
4600
+ "loss": 1.0686,
4601
+ "step": 656
4602
+ },
4603
+ {
4604
+ "epoch": 4.930581613508442,
4605
+ "grad_norm": 0.2872166037559509,
4606
+ "learning_rate": 2.5491193866866025e-06,
4607
+ "loss": 0.9811,
4608
+ "step": 657
4609
+ },
4610
+ {
4611
+ "epoch": 4.938086303939962,
4612
+ "grad_norm": 0.28239676356315613,
4613
+ "learning_rate": 2.432407044186509e-06,
4614
+ "loss": 1.0442,
4615
+ "step": 658
4616
+ },
4617
+ {
4618
+ "epoch": 4.945590994371482,
4619
+ "grad_norm": 0.2630953788757324,
4620
+ "learning_rate": 2.3183969820383735e-06,
4621
+ "loss": 0.9368,
4622
+ "step": 659
4623
+ },
4624
+ {
4625
+ "epoch": 4.953095684803002,
4626
+ "grad_norm": 0.2792162597179413,
4627
+ "learning_rate": 2.2070923258856255e-06,
4628
+ "loss": 1.1048,
4629
+ "step": 660
4630
+ },
4631
+ {
4632
+ "epoch": 4.960600375234522,
4633
+ "grad_norm": 0.2646627724170685,
4634
+ "learning_rate": 2.098496127201648e-06,
4635
+ "loss": 0.9354,
4636
+ "step": 661
4637
+ },
4638
+ {
4639
+ "epoch": 4.968105065666041,
4640
+ "grad_norm": 0.2521073818206787,
4641
+ "learning_rate": 1.992611363206103e-06,
4642
+ "loss": 1.0318,
4643
+ "step": 662
4644
+ },
4645
+ {
4646
+ "epoch": 4.975609756097561,
4647
+ "grad_norm": 0.27099478244781494,
4648
+ "learning_rate": 1.889440936783242e-06,
4649
+ "loss": 1.0485,
4650
+ "step": 663
4651
+ },
4652
+ {
4653
+ "epoch": 4.983114446529081,
4654
+ "grad_norm": 0.2716807425022125,
4655
+ "learning_rate": 1.7889876764024505e-06,
4656
+ "loss": 0.9949,
4657
+ "step": 664
4658
+ },
4659
+ {
4660
+ "epoch": 4.9906191369606,
4661
+ "grad_norm": 0.2702957093715668,
4662
+ "learning_rate": 1.691254336040595e-06,
4663
+ "loss": 1.11,
4664
+ "step": 665
4665
+ },
4666
+ {
4667
+ "epoch": 4.99812382739212,
4668
+ "grad_norm": 0.2564277648925781,
4669
+ "learning_rate": 1.59624359510657e-06,
4670
+ "loss": 0.8788,
4671
+ "step": 666
  }
  ],
  "logging_steps": 1,
@@ -3757,7 +4688,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.170847051450155e+18,
+ "total_flos": 1.4625303974966723e+18,
  "train_batch_size": 6,
  "trial_name": null,
  "trial_params": null