evgmaslov commited on
Commit
e131bdb
·
verified ·
1 Parent(s): dab3dcd

Training in progress, epoch 9, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8248ceafbddeeaf0df4157c3f181132dfa26a631f3ff91bd3e0a9ef5fa3f1569
3
  size 201352688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90a3f56c55cef5851e6e24ff2b7b8fa38c8acb2aefbe1748255e5c7947d86a90
3
  size 201352688
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b922fcdd30412293c1c1779e58026219228bda4ab3b2774e95d2071abc2b3fc
3
  size 402815162
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff416b710d3e4584a937232df5b16b2a162f2e5b98bee596b744dc41388136a1
3
  size 402815162
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e2736cb34d10c46f7acc0e360b64ff520ba91b517f3c7a6c3ba907cc8d212d7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dfdbe0760a458a4a4179393e5eabb7411f4ee7f6fa21362c3a9a65d716108a8
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:95bb70dcc716030f970fe717af8ff186a0b99796d4344bf7533bdf257cd938e9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d7c1449d2327688a19dc22c5f7f05a942f96806eecbd37990c97f51073c6b8d
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 8.0,
5
  "eval_steps": 1,
6
- "global_step": 1936,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -15567,6 +15567,1951 @@
15567
  "eval_samples_per_second": 23.971,
15568
  "eval_steps_per_second": 4.794,
15569
  "step": 1936
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15570
  }
15571
  ],
15572
  "logging_steps": 1,
@@ -15586,7 +17531,7 @@
15586
  "attributes": {}
15587
  }
15588
  },
15589
- "total_flos": 8.954025232590766e+17,
15590
  "train_batch_size": 2,
15591
  "trial_name": null,
15592
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 9.0,
5
  "eval_steps": 1,
6
+ "global_step": 2178,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
15567
  "eval_samples_per_second": 23.971,
15568
  "eval_steps_per_second": 4.794,
15569
  "step": 1936
15570
+ },
15571
+ {
15572
+ "epoch": 8.00413223140496,
15573
+ "grad_norm": 0.05322222039103508,
15574
+ "learning_rate": 1.0501045177070335e-06,
15575
+ "loss": 0.2053,
15576
+ "mean_token_accuracy": 0.9290907979011536,
15577
+ "step": 1937
15578
+ },
15579
+ {
15580
+ "epoch": 8.008264462809917,
15581
+ "grad_norm": 0.07542014122009277,
15582
+ "learning_rate": 1.0459189581655864e-06,
15583
+ "loss": 0.2204,
15584
+ "mean_token_accuracy": 0.9238471388816833,
15585
+ "step": 1938
15586
+ },
15587
+ {
15588
+ "epoch": 8.012396694214877,
15589
+ "grad_norm": 0.07172047346830368,
15590
+ "learning_rate": 1.0417407822095266e-06,
15591
+ "loss": 0.2185,
15592
+ "mean_token_accuracy": 0.9246459007263184,
15593
+ "step": 1939
15594
+ },
15595
+ {
15596
+ "epoch": 8.016528925619834,
15597
+ "grad_norm": 0.07066302001476288,
15598
+ "learning_rate": 1.037569997640896e-06,
15599
+ "loss": 0.1789,
15600
+ "mean_token_accuracy": 0.9409568905830383,
15601
+ "step": 1940
15602
+ },
15603
+ {
15604
+ "epoch": 8.020661157024794,
15605
+ "grad_norm": 0.07975345849990845,
15606
+ "learning_rate": 1.0334066122479403e-06,
15607
+ "loss": 0.2232,
15608
+ "mean_token_accuracy": 0.9241645336151123,
15609
+ "step": 1941
15610
+ },
15611
+ {
15612
+ "epoch": 8.024793388429751,
15613
+ "grad_norm": 0.06827311962842941,
15614
+ "learning_rate": 1.0292506338050834e-06,
15615
+ "loss": 0.1683,
15616
+ "mean_token_accuracy": 0.9441187381744385,
15617
+ "step": 1942
15618
+ },
15619
+ {
15620
+ "epoch": 8.02892561983471,
15621
+ "grad_norm": 0.08944051712751389,
15622
+ "learning_rate": 1.0251020700729209e-06,
15623
+ "loss": 0.1975,
15624
+ "mean_token_accuracy": 0.9320717453956604,
15625
+ "step": 1943
15626
+ },
15627
+ {
15628
+ "epoch": 8.03305785123967,
15629
+ "grad_norm": 0.09352165460586548,
15630
+ "learning_rate": 1.0209609287982047e-06,
15631
+ "loss": 0.1943,
15632
+ "mean_token_accuracy": 0.9347447752952576,
15633
+ "step": 1944
15634
+ },
15635
+ {
15636
+ "epoch": 8.037190082644628,
15637
+ "grad_norm": 0.07383626699447632,
15638
+ "learning_rate": 1.01682721771382e-06,
15639
+ "loss": 0.1209,
15640
+ "mean_token_accuracy": 0.9616514444351196,
15641
+ "step": 1945
15642
+ },
15643
+ {
15644
+ "epoch": 8.041322314049587,
15645
+ "grad_norm": 0.08939805626869202,
15646
+ "learning_rate": 1.0127009445387836e-06,
15647
+ "loss": 0.1748,
15648
+ "mean_token_accuracy": 0.9405485391616821,
15649
+ "step": 1946
15650
+ },
15651
+ {
15652
+ "epoch": 8.045454545454545,
15653
+ "grad_norm": 0.0925152450799942,
15654
+ "learning_rate": 1.00858211697822e-06,
15655
+ "loss": 0.1767,
15656
+ "mean_token_accuracy": 0.9380128979682922,
15657
+ "step": 1947
15658
+ },
15659
+ {
15660
+ "epoch": 8.049586776859504,
15661
+ "grad_norm": 0.07704450935125351,
15662
+ "learning_rate": 1.004470742723353e-06,
15663
+ "loss": 0.1219,
15664
+ "mean_token_accuracy": 0.965753436088562,
15665
+ "step": 1948
15666
+ },
15667
+ {
15668
+ "epoch": 8.053719008264462,
15669
+ "grad_norm": 0.08180603384971619,
15670
+ "learning_rate": 1.0003668294514845e-06,
15671
+ "loss": 0.1327,
15672
+ "mean_token_accuracy": 0.9580827355384827,
15673
+ "step": 1949
15674
+ },
15675
+ {
15676
+ "epoch": 8.057851239669422,
15677
+ "grad_norm": 0.08901241421699524,
15678
+ "learning_rate": 9.962703848259887e-07,
15679
+ "loss": 0.1494,
15680
+ "mean_token_accuracy": 0.9514312148094177,
15681
+ "step": 1950
15682
+ },
15683
+ {
15684
+ "epoch": 8.061983471074381,
15685
+ "grad_norm": 0.07941275835037231,
15686
+ "learning_rate": 9.921814164962878e-07,
15687
+ "loss": 0.103,
15688
+ "mean_token_accuracy": 0.9707224369049072,
15689
+ "step": 1951
15690
+ },
15691
+ {
15692
+ "epoch": 8.066115702479339,
15693
+ "grad_norm": 0.09194760769605637,
15694
+ "learning_rate": 9.880999320978495e-07,
15695
+ "loss": 0.1495,
15696
+ "mean_token_accuracy": 0.9521530866622925,
15697
+ "step": 1952
15698
+ },
15699
+ {
15700
+ "epoch": 8.070247933884298,
15701
+ "grad_norm": 0.09634792059659958,
15702
+ "learning_rate": 9.84025939252164e-07,
15703
+ "loss": 0.1544,
15704
+ "mean_token_accuracy": 0.9550842046737671,
15705
+ "step": 1953
15706
+ },
15707
+ {
15708
+ "epoch": 8.074380165289256,
15709
+ "grad_norm": 0.09481213241815567,
15710
+ "learning_rate": 9.799594455667293e-07,
15711
+ "loss": 0.1367,
15712
+ "mean_token_accuracy": 0.9575821757316589,
15713
+ "step": 1954
15714
+ },
15715
+ {
15716
+ "epoch": 8.078512396694215,
15717
+ "grad_norm": 0.09061330556869507,
15718
+ "learning_rate": 9.759004586350456e-07,
15719
+ "loss": 0.0852,
15720
+ "mean_token_accuracy": 0.9757155179977417,
15721
+ "step": 1955
15722
+ },
15723
+ {
15724
+ "epoch": 8.082644628099173,
15725
+ "grad_norm": 0.09262175112962723,
15726
+ "learning_rate": 9.718489860365882e-07,
15727
+ "loss": 0.0997,
15728
+ "mean_token_accuracy": 0.9744042158126831,
15729
+ "step": 1956
15730
+ },
15731
+ {
15732
+ "epoch": 8.086776859504132,
15733
+ "grad_norm": 0.07662644982337952,
15734
+ "learning_rate": 9.678050353368106e-07,
15735
+ "loss": 0.093,
15736
+ "mean_token_accuracy": 0.9729189872741699,
15737
+ "step": 1957
15738
+ },
15739
+ {
15740
+ "epoch": 8.090909090909092,
15741
+ "grad_norm": 0.1026345044374466,
15742
+ "learning_rate": 9.637686140871121e-07,
15743
+ "loss": 0.1111,
15744
+ "mean_token_accuracy": 0.9670698642730713,
15745
+ "step": 1958
15746
+ },
15747
+ {
15748
+ "epoch": 8.09504132231405,
15749
+ "grad_norm": 0.09648612886667252,
15750
+ "learning_rate": 9.59739729824833e-07,
15751
+ "loss": 0.1283,
15752
+ "mean_token_accuracy": 0.9622212052345276,
15753
+ "step": 1959
15754
+ },
15755
+ {
15756
+ "epoch": 8.099173553719009,
15757
+ "grad_norm": 0.09148698300123215,
15758
+ "learning_rate": 9.557183900732425e-07,
15759
+ "loss": 0.0953,
15760
+ "mean_token_accuracy": 0.9743001461029053,
15761
+ "step": 1960
15762
+ },
15763
+ {
15764
+ "epoch": 8.103305785123966,
15765
+ "grad_norm": 0.07736257463693619,
15766
+ "learning_rate": 9.517046023415205e-07,
15767
+ "loss": 0.079,
15768
+ "mean_token_accuracy": 0.9799261689186096,
15769
+ "step": 1961
15770
+ },
15771
+ {
15772
+ "epoch": 8.107438016528926,
15773
+ "grad_norm": 0.0846625566482544,
15774
+ "learning_rate": 9.476983741247464e-07,
15775
+ "loss": 0.0875,
15776
+ "mean_token_accuracy": 0.9742388725280762,
15777
+ "step": 1962
15778
+ },
15779
+ {
15780
+ "epoch": 8.111570247933884,
15781
+ "grad_norm": 0.10327938944101334,
15782
+ "learning_rate": 9.436997129038783e-07,
15783
+ "loss": 0.1394,
15784
+ "mean_token_accuracy": 0.957582950592041,
15785
+ "step": 1963
15786
+ },
15787
+ {
15788
+ "epoch": 8.115702479338843,
15789
+ "grad_norm": 0.0965140238404274,
15790
+ "learning_rate": 9.397086261457511e-07,
15791
+ "loss": 0.112,
15792
+ "mean_token_accuracy": 0.9647870659828186,
15793
+ "step": 1964
15794
+ },
15795
+ {
15796
+ "epoch": 8.119834710743802,
15797
+ "grad_norm": 0.09479817748069763,
15798
+ "learning_rate": 9.357251213030489e-07,
15799
+ "loss": 0.0908,
15800
+ "mean_token_accuracy": 0.973259449005127,
15801
+ "step": 1965
15802
+ },
15803
+ {
15804
+ "epoch": 8.12396694214876,
15805
+ "grad_norm": 0.08082997798919678,
15806
+ "learning_rate": 9.317492058143024e-07,
15807
+ "loss": 0.0831,
15808
+ "mean_token_accuracy": 0.9776373505592346,
15809
+ "step": 1966
15810
+ },
15811
+ {
15812
+ "epoch": 8.12809917355372,
15813
+ "grad_norm": 0.0902785211801529,
15814
+ "learning_rate": 9.277808871038713e-07,
15815
+ "loss": 0.0966,
15816
+ "mean_token_accuracy": 0.9756577610969543,
15817
+ "step": 1967
15818
+ },
15819
+ {
15820
+ "epoch": 8.132231404958677,
15821
+ "grad_norm": 0.0910555049777031,
15822
+ "learning_rate": 9.238201725819235e-07,
15823
+ "loss": 0.1005,
15824
+ "mean_token_accuracy": 0.9727723002433777,
15825
+ "step": 1968
15826
+ },
15827
+ {
15828
+ "epoch": 8.136363636363637,
15829
+ "grad_norm": 0.08586708456277847,
15830
+ "learning_rate": 9.198670696444339e-07,
15831
+ "loss": 0.0843,
15832
+ "mean_token_accuracy": 0.9773631691932678,
15833
+ "step": 1969
15834
+ },
15835
+ {
15836
+ "epoch": 8.140495867768594,
15837
+ "grad_norm": 0.11421328037977219,
15838
+ "learning_rate": 9.159215856731607e-07,
15839
+ "loss": 0.0998,
15840
+ "mean_token_accuracy": 0.9735649824142456,
15841
+ "step": 1970
15842
+ },
15843
+ {
15844
+ "epoch": 8.144628099173554,
15845
+ "grad_norm": 0.10374422371387482,
15846
+ "learning_rate": 9.11983728035637e-07,
15847
+ "loss": 0.0972,
15848
+ "mean_token_accuracy": 0.9747347235679626,
15849
+ "step": 1971
15850
+ },
15851
+ {
15852
+ "epoch": 8.148760330578513,
15853
+ "grad_norm": 0.13511402904987335,
15854
+ "learning_rate": 9.080535040851518e-07,
15855
+ "loss": 0.2081,
15856
+ "mean_token_accuracy": 0.9311926364898682,
15857
+ "step": 1972
15858
+ },
15859
+ {
15860
+ "epoch": 8.152892561983471,
15861
+ "grad_norm": 0.10286667943000793,
15862
+ "learning_rate": 9.04130921160743e-07,
15863
+ "loss": 0.1673,
15864
+ "mean_token_accuracy": 0.9480319023132324,
15865
+ "step": 1973
15866
+ },
15867
+ {
15868
+ "epoch": 8.15702479338843,
15869
+ "grad_norm": 0.08978980779647827,
15870
+ "learning_rate": 9.002159865871762e-07,
15871
+ "loss": 0.0977,
15872
+ "mean_token_accuracy": 0.9703608155250549,
15873
+ "step": 1974
15874
+ },
15875
+ {
15876
+ "epoch": 8.161157024793388,
15877
+ "grad_norm": 0.12502841651439667,
15878
+ "learning_rate": 8.963087076749389e-07,
15879
+ "loss": 0.145,
15880
+ "mean_token_accuracy": 0.9607588648796082,
15881
+ "step": 1975
15882
+ },
15883
+ {
15884
+ "epoch": 8.165289256198347,
15885
+ "grad_norm": 0.10160111635923386,
15886
+ "learning_rate": 8.924090917202228e-07,
15887
+ "loss": 0.0783,
15888
+ "mean_token_accuracy": 0.9801255464553833,
15889
+ "step": 1976
15890
+ },
15891
+ {
15892
+ "epoch": 8.169421487603305,
15893
+ "grad_norm": 0.1070442870259285,
15894
+ "learning_rate": 8.885171460049058e-07,
15895
+ "loss": 0.0906,
15896
+ "mean_token_accuracy": 0.973698079586029,
15897
+ "step": 1977
15898
+ },
15899
+ {
15900
+ "epoch": 8.173553719008265,
15901
+ "grad_norm": 0.09609609842300415,
15902
+ "learning_rate": 8.846328777965468e-07,
15903
+ "loss": 0.0893,
15904
+ "mean_token_accuracy": 0.9760192036628723,
15905
+ "step": 1978
15906
+ },
15907
+ {
15908
+ "epoch": 8.177685950413224,
15909
+ "grad_norm": 0.10213906317949295,
15910
+ "learning_rate": 8.807562943483683e-07,
15911
+ "loss": 0.0904,
15912
+ "mean_token_accuracy": 0.9757412672042847,
15913
+ "step": 1979
15914
+ },
15915
+ {
15916
+ "epoch": 8.181818181818182,
15917
+ "grad_norm": 0.09828820079565048,
15918
+ "learning_rate": 8.768874028992431e-07,
15919
+ "loss": 0.0897,
15920
+ "mean_token_accuracy": 0.9777717590332031,
15921
+ "step": 1980
15922
+ },
15923
+ {
15924
+ "epoch": 8.185950413223141,
15925
+ "grad_norm": 0.09995172917842865,
15926
+ "learning_rate": 8.730262106736775e-07,
15927
+ "loss": 0.0848,
15928
+ "mean_token_accuracy": 0.9790863990783691,
15929
+ "step": 1981
15930
+ },
15931
+ {
15932
+ "epoch": 8.190082644628099,
15933
+ "grad_norm": 0.09710147231817245,
15934
+ "learning_rate": 8.691727248818016e-07,
15935
+ "loss": 0.0926,
15936
+ "mean_token_accuracy": 0.9745739102363586,
15937
+ "step": 1982
15938
+ },
15939
+ {
15940
+ "epoch": 8.194214876033058,
15941
+ "grad_norm": 0.10639967769384384,
15942
+ "learning_rate": 8.65326952719357e-07,
15943
+ "loss": 0.0934,
15944
+ "mean_token_accuracy": 0.9727582335472107,
15945
+ "step": 1983
15946
+ },
15947
+ {
15948
+ "epoch": 8.198347107438016,
15949
+ "grad_norm": 0.10266918689012527,
15950
+ "learning_rate": 8.614889013676803e-07,
15951
+ "loss": 0.0922,
15952
+ "mean_token_accuracy": 0.9762585759162903,
15953
+ "step": 1984
15954
+ },
15955
+ {
15956
+ "epoch": 8.202479338842975,
15957
+ "grad_norm": 0.10627970099449158,
15958
+ "learning_rate": 8.576585779936924e-07,
15959
+ "loss": 0.0983,
15960
+ "mean_token_accuracy": 0.9713375568389893,
15961
+ "step": 1985
15962
+ },
15963
+ {
15964
+ "epoch": 8.206611570247935,
15965
+ "grad_norm": 0.12172595411539078,
15966
+ "learning_rate": 8.538359897498793e-07,
15967
+ "loss": 0.1657,
15968
+ "mean_token_accuracy": 0.9538551568984985,
15969
+ "step": 1986
15970
+ },
15971
+ {
15972
+ "epoch": 8.210743801652892,
15973
+ "grad_norm": 0.06621988117694855,
15974
+ "learning_rate": 8.500211437742878e-07,
15975
+ "loss": 0.2023,
15976
+ "mean_token_accuracy": 0.9292741417884827,
15977
+ "step": 1987
15978
+ },
15979
+ {
15980
+ "epoch": 8.214876033057852,
15981
+ "grad_norm": 0.06165366619825363,
15982
+ "learning_rate": 8.462140471905034e-07,
15983
+ "loss": 0.1687,
15984
+ "mean_token_accuracy": 0.9438784718513489,
15985
+ "step": 1988
15986
+ },
15987
+ {
15988
+ "epoch": 8.21900826446281,
15989
+ "grad_norm": 0.0728682428598404,
15990
+ "learning_rate": 8.424147071076427e-07,
15991
+ "loss": 0.1858,
15992
+ "mean_token_accuracy": 0.9376370906829834,
15993
+ "step": 1989
15994
+ },
15995
+ {
15996
+ "epoch": 8.223140495867769,
15997
+ "grad_norm": 0.0772644579410553,
15998
+ "learning_rate": 8.386231306203402e-07,
15999
+ "loss": 0.2246,
16000
+ "mean_token_accuracy": 0.924739420413971,
16001
+ "step": 1990
16002
+ },
16003
+ {
16004
+ "epoch": 8.227272727272727,
16005
+ "grad_norm": 0.09445520490407944,
16006
+ "learning_rate": 8.348393248087289e-07,
16007
+ "loss": 0.2536,
16008
+ "mean_token_accuracy": 0.9166355133056641,
16009
+ "step": 1991
16010
+ },
16011
+ {
16012
+ "epoch": 8.231404958677686,
16013
+ "grad_norm": 0.08564960211515427,
16014
+ "learning_rate": 8.310632967384341e-07,
16015
+ "loss": 0.2014,
16016
+ "mean_token_accuracy": 0.9372698664665222,
16017
+ "step": 1992
16018
+ },
16019
+ {
16020
+ "epoch": 8.235537190082646,
16021
+ "grad_norm": 0.09126199036836624,
16022
+ "learning_rate": 8.272950534605573e-07,
16023
+ "loss": 0.2703,
16024
+ "mean_token_accuracy": 0.9054905772209167,
16025
+ "step": 1993
16026
+ },
16027
+ {
16028
+ "epoch": 8.239669421487603,
16029
+ "grad_norm": 0.07200663536787033,
16030
+ "learning_rate": 8.235346020116647e-07,
16031
+ "loss": 0.1584,
16032
+ "mean_token_accuracy": 0.9474515914916992,
16033
+ "step": 1994
16034
+ },
16035
+ {
16036
+ "epoch": 8.243801652892563,
16037
+ "grad_norm": 0.07638365030288696,
16038
+ "learning_rate": 8.197819494137677e-07,
16039
+ "loss": 0.1827,
16040
+ "mean_token_accuracy": 0.9400560259819031,
16041
+ "step": 1995
16042
+ },
16043
+ {
16044
+ "epoch": 8.24793388429752,
16045
+ "grad_norm": 0.07526237517595291,
16046
+ "learning_rate": 8.160371026743202e-07,
16047
+ "loss": 0.1255,
16048
+ "mean_token_accuracy": 0.9592936038970947,
16049
+ "step": 1996
16050
+ },
16051
+ {
16052
+ "epoch": 8.25206611570248,
16053
+ "grad_norm": 0.09531582146883011,
16054
+ "learning_rate": 8.123000687861959e-07,
16055
+ "loss": 0.2096,
16056
+ "mean_token_accuracy": 0.932683527469635,
16057
+ "step": 1997
16058
+ },
16059
+ {
16060
+ "epoch": 8.256198347107437,
16061
+ "grad_norm": 0.09303406625986099,
16062
+ "learning_rate": 8.08570854727681e-07,
16063
+ "loss": 0.2018,
16064
+ "mean_token_accuracy": 0.9328662157058716,
16065
+ "step": 1998
16066
+ },
16067
+ {
16068
+ "epoch": 8.260330578512397,
16069
+ "grad_norm": 0.08916998654603958,
16070
+ "learning_rate": 8.048494674624613e-07,
16071
+ "loss": 0.1273,
16072
+ "mean_token_accuracy": 0.9591379165649414,
16073
+ "step": 1999
16074
+ },
16075
+ {
16076
+ "epoch": 8.264462809917354,
16077
+ "grad_norm": 0.08668152987957001,
16078
+ "learning_rate": 8.01135913939603e-07,
16079
+ "loss": 0.137,
16080
+ "mean_token_accuracy": 0.955795168876648,
16081
+ "step": 2000
16082
+ },
16083
+ {
16084
+ "epoch": 8.268595041322314,
16085
+ "grad_norm": 0.08069667220115662,
16086
+ "learning_rate": 7.97430201093547e-07,
16087
+ "loss": 0.1229,
16088
+ "mean_token_accuracy": 0.9618644118309021,
16089
+ "step": 2001
16090
+ },
16091
+ {
16092
+ "epoch": 8.272727272727273,
16093
+ "grad_norm": 0.09162264317274094,
16094
+ "learning_rate": 7.937323358440935e-07,
16095
+ "loss": 0.1378,
16096
+ "mean_token_accuracy": 0.9560089707374573,
16097
+ "step": 2002
16098
+ },
16099
+ {
16100
+ "epoch": 8.276859504132231,
16101
+ "grad_norm": 0.07887725532054901,
16102
+ "learning_rate": 7.90042325096389e-07,
16103
+ "loss": 0.0963,
16104
+ "mean_token_accuracy": 0.9715953469276428,
16105
+ "step": 2003
16106
+ },
16107
+ {
16108
+ "epoch": 8.28099173553719,
16109
+ "grad_norm": 0.10365016013383865,
16110
+ "learning_rate": 7.863601757409095e-07,
16111
+ "loss": 0.166,
16112
+ "mean_token_accuracy": 0.9484246373176575,
16113
+ "step": 2004
16114
+ },
16115
+ {
16116
+ "epoch": 8.285123966942148,
16117
+ "grad_norm": 0.10023301839828491,
16118
+ "learning_rate": 7.826858946534532e-07,
16119
+ "loss": 0.1623,
16120
+ "mean_token_accuracy": 0.9505438208580017,
16121
+ "step": 2005
16122
+ },
16123
+ {
16124
+ "epoch": 8.289256198347108,
16125
+ "grad_norm": 0.08399416506290436,
16126
+ "learning_rate": 7.790194886951268e-07,
16127
+ "loss": 0.0924,
16128
+ "mean_token_accuracy": 0.9722627997398376,
16129
+ "step": 2006
16130
+ },
16131
+ {
16132
+ "epoch": 8.293388429752067,
16133
+ "grad_norm": 0.09842690825462341,
16134
+ "learning_rate": 7.753609647123305e-07,
16135
+ "loss": 0.0925,
16136
+ "mean_token_accuracy": 0.9722222089767456,
16137
+ "step": 2007
16138
+ },
16139
+ {
16140
+ "epoch": 8.297520661157025,
16141
+ "grad_norm": 0.09228594601154327,
16142
+ "learning_rate": 7.717103295367473e-07,
16143
+ "loss": 0.1104,
16144
+ "mean_token_accuracy": 0.9669243693351746,
16145
+ "step": 2008
16146
+ },
16147
+ {
16148
+ "epoch": 8.301652892561984,
16149
+ "grad_norm": 0.09433568269014359,
16150
+ "learning_rate": 7.680675899853258e-07,
16151
+ "loss": 0.1252,
16152
+ "mean_token_accuracy": 0.9608188271522522,
16153
+ "step": 2009
16154
+ },
16155
+ {
16156
+ "epoch": 8.305785123966942,
16157
+ "grad_norm": 0.10079663246870041,
16158
+ "learning_rate": 7.644327528602757e-07,
16159
+ "loss": 0.1536,
16160
+ "mean_token_accuracy": 0.949020504951477,
16161
+ "step": 2010
16162
+ },
16163
+ {
16164
+ "epoch": 8.309917355371901,
16165
+ "grad_norm": 0.09348037093877792,
16166
+ "learning_rate": 7.608058249490457e-07,
16167
+ "loss": 0.1049,
16168
+ "mean_token_accuracy": 0.9676030874252319,
16169
+ "step": 2011
16170
+ },
16171
+ {
16172
+ "epoch": 8.314049586776859,
16173
+ "grad_norm": 0.09227565675973892,
16174
+ "learning_rate": 7.571868130243176e-07,
16175
+ "loss": 0.1086,
16176
+ "mean_token_accuracy": 0.96882164478302,
16177
+ "step": 2012
16178
+ },
16179
+ {
16180
+ "epoch": 8.318181818181818,
16181
+ "grad_norm": 0.10619546473026276,
16182
+ "learning_rate": 7.535757238439939e-07,
16183
+ "loss": 0.1186,
16184
+ "mean_token_accuracy": 0.9638972282409668,
16185
+ "step": 2013
16186
+ },
16187
+ {
16188
+ "epoch": 8.322314049586776,
16189
+ "grad_norm": 0.09314385056495667,
16190
+ "learning_rate": 7.499725641511762e-07,
16191
+ "loss": 0.0847,
16192
+ "mean_token_accuracy": 0.9747040867805481,
16193
+ "step": 2014
16194
+ },
16195
+ {
16196
+ "epoch": 8.326446280991735,
16197
+ "grad_norm": 0.09095818549394608,
16198
+ "learning_rate": 7.463773406741648e-07,
16199
+ "loss": 0.0946,
16200
+ "mean_token_accuracy": 0.9727653861045837,
16201
+ "step": 2015
16202
+ },
16203
+ {
16204
+ "epoch": 8.330578512396695,
16205
+ "grad_norm": 0.08989793807268143,
16206
+ "learning_rate": 7.427900601264388e-07,
16207
+ "loss": 0.1041,
16208
+ "mean_token_accuracy": 0.970187783241272,
16209
+ "step": 2016
16210
+ },
16211
+ {
16212
+ "epoch": 8.334710743801653,
16213
+ "grad_norm": 0.1408630758523941,
16214
+ "learning_rate": 7.392107292066452e-07,
16215
+ "loss": 0.2269,
16216
+ "mean_token_accuracy": 0.9238230586051941,
16217
+ "step": 2017
16218
+ },
16219
+ {
16220
+ "epoch": 8.338842975206612,
16221
+ "grad_norm": 0.07676363736391068,
16222
+ "learning_rate": 7.356393545985862e-07,
16223
+ "loss": 0.0831,
16224
+ "mean_token_accuracy": 0.977846622467041,
16225
+ "step": 2018
16226
+ },
16227
+ {
16228
+ "epoch": 8.34297520661157,
16229
+ "grad_norm": 0.11591339856386185,
16230
+ "learning_rate": 7.320759429712048e-07,
16231
+ "loss": 0.1068,
16232
+ "mean_token_accuracy": 0.9673469662666321,
16233
+ "step": 2019
16234
+ },
16235
+ {
16236
+ "epoch": 8.347107438016529,
16237
+ "grad_norm": 0.10131556540727615,
16238
+ "learning_rate": 7.285205009785784e-07,
16239
+ "loss": 0.0927,
16240
+ "mean_token_accuracy": 0.9739193320274353,
16241
+ "step": 2020
16242
+ },
16243
+ {
16244
+ "epoch": 8.351239669421487,
16245
+ "grad_norm": 0.09023724496364594,
16246
+ "learning_rate": 7.249730352599e-07,
16247
+ "loss": 0.0889,
16248
+ "mean_token_accuracy": 0.9725528359413147,
16249
+ "step": 2021
16250
+ },
16251
+ {
16252
+ "epoch": 8.355371900826446,
16253
+ "grad_norm": 0.0897325649857521,
16254
+ "learning_rate": 7.214335524394706e-07,
16255
+ "loss": 0.0785,
16256
+ "mean_token_accuracy": 0.9799548983573914,
16257
+ "step": 2022
16258
+ },
16259
+ {
16260
+ "epoch": 8.359504132231406,
16261
+ "grad_norm": 0.09022372215986252,
16262
+ "learning_rate": 7.179020591266794e-07,
16263
+ "loss": 0.1125,
16264
+ "mean_token_accuracy": 0.9685359001159668,
16265
+ "step": 2023
16266
+ },
16267
+ {
16268
+ "epoch": 8.363636363636363,
16269
+ "grad_norm": 0.08698549121618271,
16270
+ "learning_rate": 7.143785619160026e-07,
16271
+ "loss": 0.0951,
16272
+ "mean_token_accuracy": 0.9732397794723511,
16273
+ "step": 2024
16274
+ },
16275
+ {
16276
+ "epoch": 8.367768595041323,
16277
+ "grad_norm": 0.0980365052819252,
16278
+ "learning_rate": 7.108630673869805e-07,
16279
+ "loss": 0.1058,
16280
+ "mean_token_accuracy": 0.9667887091636658,
16281
+ "step": 2025
16282
+ },
16283
+ {
16284
+ "epoch": 8.37190082644628,
16285
+ "grad_norm": 0.10169877111911774,
16286
+ "learning_rate": 7.073555821042139e-07,
16287
+ "loss": 0.1002,
16288
+ "mean_token_accuracy": 0.9713459610939026,
16289
+ "step": 2026
16290
+ },
16291
+ {
16292
+ "epoch": 8.37603305785124,
16293
+ "grad_norm": 0.10198129713535309,
16294
+ "learning_rate": 7.038561126173437e-07,
16295
+ "loss": 0.1045,
16296
+ "mean_token_accuracy": 0.9714058637619019,
16297
+ "step": 2027
16298
+ },
16299
+ {
16300
+ "epoch": 8.380165289256198,
16301
+ "grad_norm": 0.10014763474464417,
16302
+ "learning_rate": 7.003646654610424e-07,
16303
+ "loss": 0.0886,
16304
+ "mean_token_accuracy": 0.9751999974250793,
16305
+ "step": 2028
16306
+ },
16307
+ {
16308
+ "epoch": 8.384297520661157,
16309
+ "grad_norm": 0.09548249840736389,
16310
+ "learning_rate": 6.968812471550063e-07,
16311
+ "loss": 0.0837,
16312
+ "mean_token_accuracy": 0.9789416790008545,
16313
+ "step": 2029
16314
+ },
16315
+ {
16316
+ "epoch": 8.388429752066116,
16317
+ "grad_norm": 0.10720735043287277,
16318
+ "learning_rate": 6.93405864203936e-07,
16319
+ "loss": 0.0906,
16320
+ "mean_token_accuracy": 0.9760934710502625,
16321
+ "step": 2030
16322
+ },
16323
+ {
16324
+ "epoch": 8.392561983471074,
16325
+ "grad_norm": 0.09425859898328781,
16326
+ "learning_rate": 6.899385230975297e-07,
16327
+ "loss": 0.0918,
16328
+ "mean_token_accuracy": 0.9751161932945251,
16329
+ "step": 2031
16330
+ },
16331
+ {
16332
+ "epoch": 8.396694214876034,
16333
+ "grad_norm": 0.11633366346359253,
16334
+ "learning_rate": 6.864792303104651e-07,
16335
+ "loss": 0.0996,
16336
+ "mean_token_accuracy": 0.9741970896720886,
16337
+ "step": 2032
16338
+ },
16339
+ {
16340
+ "epoch": 8.400826446280991,
16341
+ "grad_norm": 0.10742107778787613,
16342
+ "learning_rate": 6.830279923023946e-07,
16343
+ "loss": 0.0864,
16344
+ "mean_token_accuracy": 0.9759535789489746,
16345
+ "step": 2033
16346
+ },
16347
+ {
16348
+ "epoch": 8.40495867768595,
16349
+ "grad_norm": 0.10091706365346909,
16350
+ "learning_rate": 6.795848155179274e-07,
16351
+ "loss": 0.0884,
16352
+ "mean_token_accuracy": 0.9769123792648315,
16353
+ "step": 2034
16354
+ },
16355
+ {
16356
+ "epoch": 8.409090909090908,
16357
+ "grad_norm": 0.1497029811143875,
16358
+ "learning_rate": 6.761497063866207e-07,
16359
+ "loss": 0.1978,
16360
+ "mean_token_accuracy": 0.9377990365028381,
16361
+ "step": 2035
16362
+ },
16363
+ {
16364
+ "epoch": 8.413223140495868,
16365
+ "grad_norm": 0.10804083943367004,
16366
+ "learning_rate": 6.727226713229684e-07,
16367
+ "loss": 0.1319,
16368
+ "mean_token_accuracy": 0.9585747122764587,
16369
+ "step": 2036
16370
+ },
16371
+ {
16372
+ "epoch": 8.417355371900827,
16373
+ "grad_norm": 0.05786946043372154,
16374
+ "learning_rate": 6.693037167263828e-07,
16375
+ "loss": 0.2503,
16376
+ "mean_token_accuracy": 0.9130831360816956,
16377
+ "step": 2037
16378
+ },
16379
+ {
16380
+ "epoch": 8.421487603305785,
16381
+ "grad_norm": 0.07212464511394501,
16382
+ "learning_rate": 6.658928489811912e-07,
16383
+ "loss": 0.2339,
16384
+ "mean_token_accuracy": 0.9204217195510864,
16385
+ "step": 2038
16386
+ },
16387
+ {
16388
+ "epoch": 8.425619834710744,
16389
+ "grad_norm": 0.06576870381832123,
16390
+ "learning_rate": 6.624900744566193e-07,
16391
+ "loss": 0.2209,
16392
+ "mean_token_accuracy": 0.9230567812919617,
16393
+ "step": 2039
16394
+ },
16395
+ {
16396
+ "epoch": 8.429752066115702,
16397
+ "grad_norm": 0.07833580672740936,
16398
+ "learning_rate": 6.590953995067812e-07,
16399
+ "loss": 0.2051,
16400
+ "mean_token_accuracy": 0.9266378879547119,
16401
+ "step": 2040
16402
+ },
16403
+ {
16404
+ "epoch": 8.433884297520661,
16405
+ "grad_norm": 0.06369439512491226,
16406
+ "learning_rate": 6.557088304706627e-07,
16407
+ "loss": 0.1654,
16408
+ "mean_token_accuracy": 0.9431931376457214,
16409
+ "step": 2041
16410
+ },
16411
+ {
16412
+ "epoch": 8.438016528925619,
16413
+ "grad_norm": 0.08336784690618515,
16414
+ "learning_rate": 6.523303736721154e-07,
16415
+ "loss": 0.2379,
16416
+ "mean_token_accuracy": 0.9195821285247803,
16417
+ "step": 2042
16418
+ },
16419
+ {
16420
+ "epoch": 8.442148760330578,
16421
+ "grad_norm": 0.07352302968502045,
16422
+ "learning_rate": 6.489600354198433e-07,
16423
+ "loss": 0.1511,
16424
+ "mean_token_accuracy": 0.950443685054779,
16425
+ "step": 2043
16426
+ },
16427
+ {
16428
+ "epoch": 8.446280991735538,
16429
+ "grad_norm": 0.09255903214216232,
16430
+ "learning_rate": 6.455978220073895e-07,
16431
+ "loss": 0.2586,
16432
+ "mean_token_accuracy": 0.9118536710739136,
16433
+ "step": 2044
16434
+ },
16435
+ {
16436
+ "epoch": 8.450413223140496,
16437
+ "grad_norm": 0.07923895865678787,
16438
+ "learning_rate": 6.422437397131265e-07,
16439
+ "loss": 0.1312,
16440
+ "mean_token_accuracy": 0.9583396911621094,
16441
+ "step": 2045
16442
+ },
16443
+ {
16444
+ "epoch": 8.454545454545455,
16445
+ "grad_norm": 0.08579988032579422,
16446
+ "learning_rate": 6.388977948002406e-07,
16447
+ "loss": 0.1845,
16448
+ "mean_token_accuracy": 0.9370260238647461,
16449
+ "step": 2046
16450
+ },
16451
+ {
16452
+ "epoch": 8.458677685950413,
16453
+ "grad_norm": 0.08608614653348923,
16454
+ "learning_rate": 6.355599935167256e-07,
16455
+ "loss": 0.1863,
16456
+ "mean_token_accuracy": 0.9352179169654846,
16457
+ "step": 2047
16458
+ },
16459
+ {
16460
+ "epoch": 8.462809917355372,
16461
+ "grad_norm": 0.08685126155614853,
16462
+ "learning_rate": 6.322303420953673e-07,
16463
+ "loss": 0.1521,
16464
+ "mean_token_accuracy": 0.9517453908920288,
16465
+ "step": 2048
16466
+ },
16467
+ {
16468
+ "epoch": 8.46694214876033,
16469
+ "grad_norm": 0.09084443747997284,
16470
+ "learning_rate": 6.289088467537341e-07,
16471
+ "loss": 0.1344,
16472
+ "mean_token_accuracy": 0.955075740814209,
16473
+ "step": 2049
16474
+ },
16475
+ {
16476
+ "epoch": 8.47107438016529,
16477
+ "grad_norm": 0.10212317854166031,
16478
+ "learning_rate": 6.255955136941627e-07,
16479
+ "loss": 0.1827,
16480
+ "mean_token_accuracy": 0.9430245161056519,
16481
+ "step": 2050
16482
+ },
16483
+ {
16484
+ "epoch": 8.475206611570249,
16485
+ "grad_norm": 0.09634598344564438,
16486
+ "learning_rate": 6.222903491037474e-07,
16487
+ "loss": 0.1735,
16488
+ "mean_token_accuracy": 0.9435832500457764,
16489
+ "step": 2051
16490
+ },
16491
+ {
16492
+ "epoch": 8.479338842975206,
16493
+ "grad_norm": 0.07654455304145813,
16494
+ "learning_rate": 6.189933591543318e-07,
16495
+ "loss": 0.1052,
16496
+ "mean_token_accuracy": 0.9696394801139832,
16497
+ "step": 2052
16498
+ },
16499
+ {
16500
+ "epoch": 8.483471074380166,
16501
+ "grad_norm": 0.08662577718496323,
16502
+ "learning_rate": 6.157045500024933e-07,
16503
+ "loss": 0.1196,
16504
+ "mean_token_accuracy": 0.9640316367149353,
16505
+ "step": 2053
16506
+ },
16507
+ {
16508
+ "epoch": 8.487603305785123,
16509
+ "grad_norm": 0.0935806855559349,
16510
+ "learning_rate": 6.124239277895344e-07,
16511
+ "loss": 0.1112,
16512
+ "mean_token_accuracy": 0.9642053842544556,
16513
+ "step": 2054
16514
+ },
16515
+ {
16516
+ "epoch": 8.491735537190083,
16517
+ "grad_norm": 0.0817096158862114,
16518
+ "learning_rate": 6.091514986414665e-07,
16519
+ "loss": 0.0977,
16520
+ "mean_token_accuracy": 0.9727157354354858,
16521
+ "step": 2055
16522
+ },
16523
+ {
16524
+ "epoch": 8.49586776859504,
16525
+ "grad_norm": 0.08830783516168594,
16526
+ "learning_rate": 6.058872686690048e-07,
16527
+ "loss": 0.0979,
16528
+ "mean_token_accuracy": 0.9704757928848267,
16529
+ "step": 2056
16530
+ },
16531
+ {
16532
+ "epoch": 8.5,
16533
+ "grad_norm": 0.09076707810163498,
16534
+ "learning_rate": 6.026312439675553e-07,
16535
+ "loss": 0.0942,
16536
+ "mean_token_accuracy": 0.971440851688385,
16537
+ "step": 2057
16538
+ },
16539
+ {
16540
+ "epoch": 8.50413223140496,
16541
+ "grad_norm": 0.09585954248905182,
16542
+ "learning_rate": 5.993834306171964e-07,
16543
+ "loss": 0.1453,
16544
+ "mean_token_accuracy": 0.9532176852226257,
16545
+ "step": 2058
16546
+ },
16547
+ {
16548
+ "epoch": 8.508264462809917,
16549
+ "grad_norm": 0.10207119584083557,
16550
+ "learning_rate": 5.961438346826792e-07,
16551
+ "loss": 0.1159,
16552
+ "mean_token_accuracy": 0.9636322855949402,
16553
+ "step": 2059
16554
+ },
16555
+ {
16556
+ "epoch": 8.512396694214877,
16557
+ "grad_norm": 0.10537750273942947,
16558
+ "learning_rate": 5.929124622134058e-07,
16559
+ "loss": 0.1239,
16560
+ "mean_token_accuracy": 0.9623029232025146,
16561
+ "step": 2060
16562
+ },
16563
+ {
16564
+ "epoch": 8.516528925619834,
16565
+ "grad_norm": 0.08574347198009491,
16566
+ "learning_rate": 5.896893192434249e-07,
16567
+ "loss": 0.0818,
16568
+ "mean_token_accuracy": 0.97648686170578,
16569
+ "step": 2061
16570
+ },
16571
+ {
16572
+ "epoch": 8.520661157024794,
16573
+ "grad_norm": 0.1016509085893631,
16574
+ "learning_rate": 5.864744117914179e-07,
16575
+ "loss": 0.1009,
16576
+ "mean_token_accuracy": 0.9712011218070984,
16577
+ "step": 2062
16578
+ },
16579
+ {
16580
+ "epoch": 8.524793388429751,
16581
+ "grad_norm": 0.09164122492074966,
16582
+ "learning_rate": 5.832677458606867e-07,
16583
+ "loss": 0.0942,
16584
+ "mean_token_accuracy": 0.9732291102409363,
16585
+ "step": 2063
16586
+ },
16587
+ {
16588
+ "epoch": 8.52892561983471,
16589
+ "grad_norm": 0.08601871877908707,
16590
+ "learning_rate": 5.800693274391439e-07,
16591
+ "loss": 0.0799,
16592
+ "mean_token_accuracy": 0.9785696864128113,
16593
+ "step": 2064
16594
+ },
16595
+ {
16596
+ "epoch": 8.53305785123967,
16597
+ "grad_norm": 0.11451072990894318,
16598
+ "learning_rate": 5.768791624993003e-07,
16599
+ "loss": 0.159,
16600
+ "mean_token_accuracy": 0.9517607092857361,
16601
+ "step": 2065
16602
+ },
16603
+ {
16604
+ "epoch": 8.537190082644628,
16605
+ "grad_norm": 0.10741297900676727,
16606
+ "learning_rate": 5.736972569982558e-07,
16607
+ "loss": 0.1089,
16608
+ "mean_token_accuracy": 0.9691147804260254,
16609
+ "step": 2066
16610
+ },
16611
+ {
16612
+ "epoch": 8.541322314049587,
16613
+ "grad_norm": 0.11344994604587555,
16614
+ "learning_rate": 5.705236168776879e-07,
16615
+ "loss": 0.1463,
16616
+ "mean_token_accuracy": 0.9595220685005188,
16617
+ "step": 2067
16618
+ },
16619
+ {
16620
+ "epoch": 8.545454545454545,
16621
+ "grad_norm": 0.10220891237258911,
16622
+ "learning_rate": 5.673582480638395e-07,
16623
+ "loss": 0.1063,
16624
+ "mean_token_accuracy": 0.9685812592506409,
16625
+ "step": 2068
16626
+ },
16627
+ {
16628
+ "epoch": 8.549586776859504,
16629
+ "grad_norm": 0.08758968859910965,
16630
+ "learning_rate": 5.642011564675065e-07,
16631
+ "loss": 0.0872,
16632
+ "mean_token_accuracy": 0.9768015742301941,
16633
+ "step": 2069
16634
+ },
16635
+ {
16636
+ "epoch": 8.553719008264462,
16637
+ "grad_norm": 0.10929395258426666,
16638
+ "learning_rate": 5.610523479840297e-07,
16639
+ "loss": 0.0947,
16640
+ "mean_token_accuracy": 0.9717923402786255,
16641
+ "step": 2070
16642
+ },
16643
+ {
16644
+ "epoch": 8.557851239669422,
16645
+ "grad_norm": 0.11096024513244629,
16646
+ "learning_rate": 5.579118284932844e-07,
16647
+ "loss": 0.1365,
16648
+ "mean_token_accuracy": 0.9577394127845764,
16649
+ "step": 2071
16650
+ },
16651
+ {
16652
+ "epoch": 8.561983471074381,
16653
+ "grad_norm": 0.0956474244594574,
16654
+ "learning_rate": 5.547796038596637e-07,
16655
+ "loss": 0.1036,
16656
+ "mean_token_accuracy": 0.9721804261207581,
16657
+ "step": 2072
16658
+ },
16659
+ {
16660
+ "epoch": 8.566115702479339,
16661
+ "grad_norm": 0.09017419070005417,
16662
+ "learning_rate": 5.51655679932075e-07,
16663
+ "loss": 0.0806,
16664
+ "mean_token_accuracy": 0.9797005653381348,
16665
+ "step": 2073
16666
+ },
16667
+ {
16668
+ "epoch": 8.570247933884298,
16669
+ "grad_norm": 0.10327083617448807,
16670
+ "learning_rate": 5.485400625439219e-07,
16671
+ "loss": 0.1038,
16672
+ "mean_token_accuracy": 0.9685261249542236,
16673
+ "step": 2074
16674
+ },
16675
+ {
16676
+ "epoch": 8.574380165289256,
16677
+ "grad_norm": 0.10319899022579193,
16678
+ "learning_rate": 5.454327575131007e-07,
16679
+ "loss": 0.0907,
16680
+ "mean_token_accuracy": 0.9753340482711792,
16681
+ "step": 2075
16682
+ },
16683
+ {
16684
+ "epoch": 8.578512396694215,
16685
+ "grad_norm": 0.10876299440860748,
16686
+ "learning_rate": 5.423337706419846e-07,
16687
+ "loss": 0.1003,
16688
+ "mean_token_accuracy": 0.9718273282051086,
16689
+ "step": 2076
16690
+ },
16691
+ {
16692
+ "epoch": 8.582644628099173,
16693
+ "grad_norm": 0.10647837072610855,
16694
+ "learning_rate": 5.392431077174131e-07,
16695
+ "loss": 0.1686,
16696
+ "mean_token_accuracy": 0.948123037815094,
16697
+ "step": 2077
16698
+ },
16699
+ {
16700
+ "epoch": 8.586776859504132,
16701
+ "grad_norm": 0.09515678137540817,
16702
+ "learning_rate": 5.361607745106817e-07,
16703
+ "loss": 0.0908,
16704
+ "mean_token_accuracy": 0.9741514325141907,
16705
+ "step": 2078
16706
+ },
16707
+ {
16708
+ "epoch": 8.590909090909092,
16709
+ "grad_norm": 0.10034073889255524,
16710
+ "learning_rate": 5.330867767775333e-07,
16711
+ "loss": 0.0898,
16712
+ "mean_token_accuracy": 0.9729089736938477,
16713
+ "step": 2079
16714
+ },
16715
+ {
16716
+ "epoch": 8.59504132231405,
16717
+ "grad_norm": 0.09543359279632568,
16718
+ "learning_rate": 5.300211202581451e-07,
16719
+ "loss": 0.0814,
16720
+ "mean_token_accuracy": 0.9785924553871155,
16721
+ "step": 2080
16722
+ },
16723
+ {
16724
+ "epoch": 8.599173553719009,
16725
+ "grad_norm": 0.11278136074542999,
16726
+ "learning_rate": 5.269638106771174e-07,
16727
+ "loss": 0.1497,
16728
+ "mean_token_accuracy": 0.9543736577033997,
16729
+ "step": 2081
16730
+ },
16731
+ {
16732
+ "epoch": 8.603305785123966,
16733
+ "grad_norm": 0.08995888382196426,
16734
+ "learning_rate": 5.239148537434658e-07,
16735
+ "loss": 0.094,
16736
+ "mean_token_accuracy": 0.9726177453994751,
16737
+ "step": 2082
16738
+ },
16739
+ {
16740
+ "epoch": 8.607438016528926,
16741
+ "grad_norm": 0.10781515389680862,
16742
+ "learning_rate": 5.208742551506057e-07,
16743
+ "loss": 0.0955,
16744
+ "mean_token_accuracy": 0.9749103784561157,
16745
+ "step": 2083
16746
+ },
16747
+ {
16748
+ "epoch": 8.611570247933884,
16749
+ "grad_norm": 0.1258586049079895,
16750
+ "learning_rate": 5.178420205763484e-07,
16751
+ "loss": 0.1392,
16752
+ "mean_token_accuracy": 0.958977222442627,
16753
+ "step": 2084
16754
+ },
16755
+ {
16756
+ "epoch": 8.615702479338843,
16757
+ "grad_norm": 0.11668509989976883,
16758
+ "learning_rate": 5.148181556828847e-07,
16759
+ "loss": 0.0891,
16760
+ "mean_token_accuracy": 0.977047324180603,
16761
+ "step": 2085
16762
+ },
16763
+ {
16764
+ "epoch": 8.619834710743802,
16765
+ "grad_norm": 0.12788750231266022,
16766
+ "learning_rate": 5.118026661167774e-07,
16767
+ "loss": 0.1437,
16768
+ "mean_token_accuracy": 0.9538551568984985,
16769
+ "step": 2086
16770
+ },
16771
+ {
16772
+ "epoch": 8.62396694214876,
16773
+ "grad_norm": 0.05820296332240105,
16774
+ "learning_rate": 5.087955575089493e-07,
16775
+ "loss": 0.2225,
16776
+ "mean_token_accuracy": 0.9198437333106995,
16777
+ "step": 2087
16778
+ },
16779
+ {
16780
+ "epoch": 8.62809917355372,
16781
+ "grad_norm": 0.05990159511566162,
16782
+ "learning_rate": 5.057968354746706e-07,
16783
+ "loss": 0.2175,
16784
+ "mean_token_accuracy": 0.9251121282577515,
16785
+ "step": 2088
16786
+ },
16787
+ {
16788
+ "epoch": 8.632231404958677,
16789
+ "grad_norm": 0.06292181462049484,
16790
+ "learning_rate": 5.028065056135561e-07,
16791
+ "loss": 0.2055,
16792
+ "mean_token_accuracy": 0.9290311932563782,
16793
+ "step": 2089
16794
+ },
16795
+ {
16796
+ "epoch": 8.636363636363637,
16797
+ "grad_norm": 0.0732082948088646,
16798
+ "learning_rate": 4.998245735095459e-07,
16799
+ "loss": 0.2348,
16800
+ "mean_token_accuracy": 0.9177881479263306,
16801
+ "step": 2090
16802
+ },
16803
+ {
16804
+ "epoch": 8.640495867768594,
16805
+ "grad_norm": 0.06980929523706436,
16806
+ "learning_rate": 4.968510447309005e-07,
16807
+ "loss": 0.1921,
16808
+ "mean_token_accuracy": 0.9333738684654236,
16809
+ "step": 2091
16810
+ },
16811
+ {
16812
+ "epoch": 8.644628099173554,
16813
+ "grad_norm": 0.07125243544578552,
16814
+ "learning_rate": 4.938859248301863e-07,
16815
+ "loss": 0.1748,
16816
+ "mean_token_accuracy": 0.9390982389450073,
16817
+ "step": 2092
16818
+ },
16819
+ {
16820
+ "epoch": 8.648760330578511,
16821
+ "grad_norm": 0.07631165534257889,
16822
+ "learning_rate": 4.909292193442705e-07,
16823
+ "loss": 0.1969,
16824
+ "mean_token_accuracy": 0.9327918887138367,
16825
+ "step": 2093
16826
+ },
16827
+ {
16828
+ "epoch": 8.652892561983471,
16829
+ "grad_norm": 0.0754714235663414,
16830
+ "learning_rate": 4.87980933794307e-07,
16831
+ "loss": 0.179,
16832
+ "mean_token_accuracy": 0.942600429058075,
16833
+ "step": 2094
16834
+ },
16835
+ {
16836
+ "epoch": 8.65702479338843,
16837
+ "grad_norm": 0.10651316493749619,
16838
+ "learning_rate": 4.850410736857236e-07,
16839
+ "loss": 0.2142,
16840
+ "mean_token_accuracy": 0.9307475090026855,
16841
+ "step": 2095
16842
+ },
16843
+ {
16844
+ "epoch": 8.661157024793388,
16845
+ "grad_norm": 0.08369658142328262,
16846
+ "learning_rate": 4.821096445082208e-07,
16847
+ "loss": 0.1839,
16848
+ "mean_token_accuracy": 0.9419768452644348,
16849
+ "step": 2096
16850
+ },
16851
+ {
16852
+ "epoch": 8.665289256198347,
16853
+ "grad_norm": 0.08058687299489975,
16854
+ "learning_rate": 4.791866517357491e-07,
16855
+ "loss": 0.1606,
16856
+ "mean_token_accuracy": 0.950334906578064,
16857
+ "step": 2097
16858
+ },
16859
+ {
16860
+ "epoch": 8.669421487603305,
16861
+ "grad_norm": 0.08795657008886337,
16862
+ "learning_rate": 4.762721008265114e-07,
16863
+ "loss": 0.1745,
16864
+ "mean_token_accuracy": 0.9430282115936279,
16865
+ "step": 2098
16866
+ },
16867
+ {
16868
+ "epoch": 8.673553719008265,
16869
+ "grad_norm": 0.08912398666143417,
16870
+ "learning_rate": 4.733659972229437e-07,
16871
+ "loss": 0.1724,
16872
+ "mean_token_accuracy": 0.9450215101242065,
16873
+ "step": 2099
16874
+ },
16875
+ {
16876
+ "epoch": 8.677685950413224,
16877
+ "grad_norm": 0.08674637228250504,
16878
+ "learning_rate": 4.7046834635170956e-07,
16879
+ "loss": 0.1258,
16880
+ "mean_token_accuracy": 0.9621280431747437,
16881
+ "step": 2100
16882
+ },
16883
+ {
16884
+ "epoch": 8.681818181818182,
16885
+ "grad_norm": 0.0879029706120491,
16886
+ "learning_rate": 4.6757915362368567e-07,
16887
+ "loss": 0.1673,
16888
+ "mean_token_accuracy": 0.9448676109313965,
16889
+ "step": 2101
16890
+ },
16891
+ {
16892
+ "epoch": 8.685950413223141,
16893
+ "grad_norm": 0.07600904256105423,
16894
+ "learning_rate": 4.646984244339575e-07,
16895
+ "loss": 0.0938,
16896
+ "mean_token_accuracy": 0.9702988266944885,
16897
+ "step": 2102
16898
+ },
16899
+ {
16900
+ "epoch": 8.690082644628099,
16901
+ "grad_norm": 0.07806258648633957,
16902
+ "learning_rate": 4.61826164161806e-07,
16903
+ "loss": 0.1091,
16904
+ "mean_token_accuracy": 0.9627501964569092,
16905
+ "step": 2103
16906
+ },
16907
+ {
16908
+ "epoch": 8.694214876033058,
16909
+ "grad_norm": 0.09047354757785797,
16910
+ "learning_rate": 4.589623781706959e-07,
16911
+ "loss": 0.1213,
16912
+ "mean_token_accuracy": 0.9610835313796997,
16913
+ "step": 2104
16914
+ },
16915
+ {
16916
+ "epoch": 8.698347107438016,
16917
+ "grad_norm": 0.0882289707660675,
16918
+ "learning_rate": 4.5610707180826996e-07,
16919
+ "loss": 0.0973,
16920
+ "mean_token_accuracy": 0.9684313535690308,
16921
+ "step": 2105
16922
+ },
16923
+ {
16924
+ "epoch": 8.702479338842975,
16925
+ "grad_norm": 0.07399041950702667,
16926
+ "learning_rate": 4.532602504063344e-07,
16927
+ "loss": 0.1023,
16928
+ "mean_token_accuracy": 0.9700278043746948,
16929
+ "step": 2106
16930
+ },
16931
+ {
16932
+ "epoch": 8.706611570247933,
16933
+ "grad_norm": 0.0878264531493187,
16934
+ "learning_rate": 4.504219192808529e-07,
16935
+ "loss": 0.0977,
16936
+ "mean_token_accuracy": 0.970322847366333,
16937
+ "step": 2107
16938
+ },
16939
+ {
16940
+ "epoch": 8.710743801652892,
16941
+ "grad_norm": 0.0884372815489769,
16942
+ "learning_rate": 4.4759208373193365e-07,
16943
+ "loss": 0.1047,
16944
+ "mean_token_accuracy": 0.9682474136352539,
16945
+ "step": 2108
16946
+ },
16947
+ {
16948
+ "epoch": 8.714876033057852,
16949
+ "grad_norm": 0.09372899681329727,
16950
+ "learning_rate": 4.447707490438236e-07,
16951
+ "loss": 0.1364,
16952
+ "mean_token_accuracy": 0.957731306552887,
16953
+ "step": 2109
16954
+ },
16955
+ {
16956
+ "epoch": 8.71900826446281,
16957
+ "grad_norm": 0.09550356864929199,
16958
+ "learning_rate": 4.4195792048489226e-07,
16959
+ "loss": 0.1414,
16960
+ "mean_token_accuracy": 0.9544153213500977,
16961
+ "step": 2110
16962
+ },
16963
+ {
16964
+ "epoch": 8.723140495867769,
16965
+ "grad_norm": 0.07899662107229233,
16966
+ "learning_rate": 4.39153603307626e-07,
16967
+ "loss": 0.0792,
16968
+ "mean_token_accuracy": 0.9783337116241455,
16969
+ "step": 2111
16970
+ },
16971
+ {
16972
+ "epoch": 8.727272727272727,
16973
+ "grad_norm": 0.09830790758132935,
16974
+ "learning_rate": 4.363578027486187e-07,
16975
+ "loss": 0.1541,
16976
+ "mean_token_accuracy": 0.9491906762123108,
16977
+ "step": 2112
16978
+ },
16979
+ {
16980
+ "epoch": 8.731404958677686,
16981
+ "grad_norm": 0.09043899923563004,
16982
+ "learning_rate": 4.335705240285609e-07,
16983
+ "loss": 0.102,
16984
+ "mean_token_accuracy": 0.9689905047416687,
16985
+ "step": 2113
16986
+ },
16987
+ {
16988
+ "epoch": 8.735537190082646,
16989
+ "grad_norm": 0.09777707606554031,
16990
+ "learning_rate": 4.307917723522315e-07,
16991
+ "loss": 0.1352,
16992
+ "mean_token_accuracy": 0.9594070911407471,
16993
+ "step": 2114
16994
+ },
16995
+ {
16996
+ "epoch": 8.739669421487603,
16997
+ "grad_norm": 0.10187830775976181,
16998
+ "learning_rate": 4.2802155290848133e-07,
16999
+ "loss": 0.0885,
17000
+ "mean_token_accuracy": 0.9753796458244324,
17001
+ "step": 2115
17002
+ },
17003
+ {
17004
+ "epoch": 8.743801652892563,
17005
+ "grad_norm": 0.09098262339830399,
17006
+ "learning_rate": 4.2525987087023433e-07,
17007
+ "loss": 0.0822,
17008
+ "mean_token_accuracy": 0.9769821166992188,
17009
+ "step": 2116
17010
+ },
17011
+ {
17012
+ "epoch": 8.74793388429752,
17013
+ "grad_norm": 0.08996855467557907,
17014
+ "learning_rate": 4.225067313944703e-07,
17015
+ "loss": 0.097,
17016
+ "mean_token_accuracy": 0.9690431356430054,
17017
+ "step": 2117
17018
+ },
17019
+ {
17020
+ "epoch": 8.75206611570248,
17021
+ "grad_norm": 0.09314204007387161,
17022
+ "learning_rate": 4.1976213962221513e-07,
17023
+ "loss": 0.0865,
17024
+ "mean_token_accuracy": 0.9770414233207703,
17025
+ "step": 2118
17026
+ },
17027
+ {
17028
+ "epoch": 8.756198347107437,
17029
+ "grad_norm": 0.10382431000471115,
17030
+ "learning_rate": 4.1702610067853756e-07,
17031
+ "loss": 0.1087,
17032
+ "mean_token_accuracy": 0.9670014381408691,
17033
+ "step": 2119
17034
+ },
17035
+ {
17036
+ "epoch": 8.760330578512397,
17037
+ "grad_norm": 0.08995066583156586,
17038
+ "learning_rate": 4.1429861967253073e-07,
17039
+ "loss": 0.102,
17040
+ "mean_token_accuracy": 0.9701564311981201,
17041
+ "step": 2120
17042
+ },
17043
+ {
17044
+ "epoch": 8.764462809917354,
17045
+ "grad_norm": 0.11203364282846451,
17046
+ "learning_rate": 4.1157970169731057e-07,
17047
+ "loss": 0.1025,
17048
+ "mean_token_accuracy": 0.9696673154830933,
17049
+ "step": 2121
17050
+ },
17051
+ {
17052
+ "epoch": 8.768595041322314,
17053
+ "grad_norm": 0.111559197306633,
17054
+ "learning_rate": 4.088693518300019e-07,
17055
+ "loss": 0.1204,
17056
+ "mean_token_accuracy": 0.9662195444107056,
17057
+ "step": 2122
17058
+ },
17059
+ {
17060
+ "epoch": 8.772727272727273,
17061
+ "grad_norm": 0.10789839178323746,
17062
+ "learning_rate": 4.0616757513173123e-07,
17063
+ "loss": 0.1253,
17064
+ "mean_token_accuracy": 0.9634451866149902,
17065
+ "step": 2123
17066
+ },
17067
+ {
17068
+ "epoch": 8.776859504132231,
17069
+ "grad_norm": 0.0962676927447319,
17070
+ "learning_rate": 4.0347437664761237e-07,
17071
+ "loss": 0.1044,
17072
+ "mean_token_accuracy": 0.9675620794296265,
17073
+ "step": 2124
17074
+ },
17075
+ {
17076
+ "epoch": 8.78099173553719,
17077
+ "grad_norm": 0.09367375820875168,
17078
+ "learning_rate": 4.0078976140674465e-07,
17079
+ "loss": 0.0944,
17080
+ "mean_token_accuracy": 0.9761354923248291,
17081
+ "step": 2125
17082
+ },
17083
+ {
17084
+ "epoch": 8.785123966942148,
17085
+ "grad_norm": 0.09414532035589218,
17086
+ "learning_rate": 3.981137344221986e-07,
17087
+ "loss": 0.0882,
17088
+ "mean_token_accuracy": 0.9753566980361938,
17089
+ "step": 2126
17090
+ },
17091
+ {
17092
+ "epoch": 8.789256198347108,
17093
+ "grad_norm": 0.10240423679351807,
17094
+ "learning_rate": 3.9544630069100644e-07,
17095
+ "loss": 0.0886,
17096
+ "mean_token_accuracy": 0.9755526781082153,
17097
+ "step": 2127
17098
+ },
17099
+ {
17100
+ "epoch": 8.793388429752067,
17101
+ "grad_norm": 0.10124680399894714,
17102
+ "learning_rate": 3.9278746519415655e-07,
17103
+ "loss": 0.0851,
17104
+ "mean_token_accuracy": 0.9779295921325684,
17105
+ "step": 2128
17106
+ },
17107
+ {
17108
+ "epoch": 8.797520661157025,
17109
+ "grad_norm": 0.11509440094232559,
17110
+ "learning_rate": 3.901372328965769e-07,
17111
+ "loss": 0.0877,
17112
+ "mean_token_accuracy": 0.9781274795532227,
17113
+ "step": 2129
17114
+ },
17115
+ {
17116
+ "epoch": 8.801652892561984,
17117
+ "grad_norm": 0.12082704156637192,
17118
+ "learning_rate": 3.874956087471354e-07,
17119
+ "loss": 0.108,
17120
+ "mean_token_accuracy": 0.9729946255683899,
17121
+ "step": 2130
17122
+ },
17123
+ {
17124
+ "epoch": 8.805785123966942,
17125
+ "grad_norm": 0.0965445339679718,
17126
+ "learning_rate": 3.8486259767862243e-07,
17127
+ "loss": 0.0809,
17128
+ "mean_token_accuracy": 0.9787408113479614,
17129
+ "step": 2131
17130
+ },
17131
+ {
17132
+ "epoch": 8.809917355371901,
17133
+ "grad_norm": 0.11878959834575653,
17134
+ "learning_rate": 3.822382046077483e-07,
17135
+ "loss": 0.1459,
17136
+ "mean_token_accuracy": 0.9578744769096375,
17137
+ "step": 2132
17138
+ },
17139
+ {
17140
+ "epoch": 8.814049586776859,
17141
+ "grad_norm": 0.10381490737199783,
17142
+ "learning_rate": 3.7962243443512627e-07,
17143
+ "loss": 0.0811,
17144
+ "mean_token_accuracy": 0.9788123965263367,
17145
+ "step": 2133
17146
+ },
17147
+ {
17148
+ "epoch": 8.818181818181818,
17149
+ "grad_norm": 0.11626556515693665,
17150
+ "learning_rate": 3.7701529204526856e-07,
17151
+ "loss": 0.1372,
17152
+ "mean_token_accuracy": 0.9591605067253113,
17153
+ "step": 2134
17154
+ },
17155
+ {
17156
+ "epoch": 8.822314049586776,
17157
+ "grad_norm": 0.10921085625886917,
17158
+ "learning_rate": 3.744167823065814e-07,
17159
+ "loss": 0.0944,
17160
+ "mean_token_accuracy": 0.9741052985191345,
17161
+ "step": 2135
17162
+ },
17163
+ {
17164
+ "epoch": 8.826446280991735,
17165
+ "grad_norm": 0.1696995347738266,
17166
+ "learning_rate": 3.718269100713445e-07,
17167
+ "loss": 0.1855,
17168
+ "mean_token_accuracy": 0.9409846663475037,
17169
+ "step": 2136
17170
+ },
17171
+ {
17172
+ "epoch": 8.830578512396695,
17173
+ "grad_norm": 0.06852439790964127,
17174
+ "learning_rate": 3.692456801757133e-07,
17175
+ "loss": 0.2206,
17176
+ "mean_token_accuracy": 0.9229573011398315,
17177
+ "step": 2137
17178
+ },
17179
+ {
17180
+ "epoch": 8.834710743801653,
17181
+ "grad_norm": 0.07205780595541,
17182
+ "learning_rate": 3.6667309743970147e-07,
17183
+ "loss": 0.2139,
17184
+ "mean_token_accuracy": 0.9254477024078369,
17185
+ "step": 2138
17186
+ },
17187
+ {
17188
+ "epoch": 8.838842975206612,
17189
+ "grad_norm": 0.07377646863460541,
17190
+ "learning_rate": 3.641091666671781e-07,
17191
+ "loss": 0.1921,
17192
+ "mean_token_accuracy": 0.9347940683364868,
17193
+ "step": 2139
17194
+ },
17195
+ {
17196
+ "epoch": 8.84297520661157,
17197
+ "grad_norm": 0.08118410408496857,
17198
+ "learning_rate": 3.615538926458556e-07,
17199
+ "loss": 0.2315,
17200
+ "mean_token_accuracy": 0.9196543097496033,
17201
+ "step": 2140
17202
+ },
17203
+ {
17204
+ "epoch": 8.847107438016529,
17205
+ "grad_norm": 0.0802064910531044,
17206
+ "learning_rate": 3.5900728014728046e-07,
17207
+ "loss": 0.2082,
17208
+ "mean_token_accuracy": 0.9265360832214355,
17209
+ "step": 2141
17210
+ },
17211
+ {
17212
+ "epoch": 8.851239669421489,
17213
+ "grad_norm": 0.07559309154748917,
17214
+ "learning_rate": 3.564693339268266e-07,
17215
+ "loss": 0.2115,
17216
+ "mean_token_accuracy": 0.9263385534286499,
17217
+ "step": 2142
17218
+ },
17219
+ {
17220
+ "epoch": 8.855371900826446,
17221
+ "grad_norm": 0.07263598591089249,
17222
+ "learning_rate": 3.539400587236824e-07,
17223
+ "loss": 0.1836,
17224
+ "mean_token_accuracy": 0.9360730648040771,
17225
+ "step": 2143
17226
+ },
17227
+ {
17228
+ "epoch": 8.859504132231406,
17229
+ "grad_norm": 0.0771942138671875,
17230
+ "learning_rate": 3.514194592608489e-07,
17231
+ "loss": 0.1768,
17232
+ "mean_token_accuracy": 0.9406779408454895,
17233
+ "step": 2144
17234
+ },
17235
+ {
17236
+ "epoch": 8.863636363636363,
17237
+ "grad_norm": 0.1021052822470665,
17238
+ "learning_rate": 3.4890754024512254e-07,
17239
+ "loss": 0.2382,
17240
+ "mean_token_accuracy": 0.9189664721488953,
17241
+ "step": 2145
17242
+ },
17243
+ {
17244
+ "epoch": 8.867768595041323,
17245
+ "grad_norm": 0.07152920961380005,
17246
+ "learning_rate": 3.464043063670941e-07,
17247
+ "loss": 0.176,
17248
+ "mean_token_accuracy": 0.9418604373931885,
17249
+ "step": 2146
17250
+ },
17251
+ {
17252
+ "epoch": 8.87190082644628,
17253
+ "grad_norm": 0.09480854123830795,
17254
+ "learning_rate": 3.439097623011328e-07,
17255
+ "loss": 0.227,
17256
+ "mean_token_accuracy": 0.9247565865516663,
17257
+ "step": 2147
17258
+ },
17259
+ {
17260
+ "epoch": 8.87603305785124,
17261
+ "grad_norm": 0.06957484036684036,
17262
+ "learning_rate": 3.41423912705382e-07,
17263
+ "loss": 0.1406,
17264
+ "mean_token_accuracy": 0.9543790221214294,
17265
+ "step": 2148
17266
+ },
17267
+ {
17268
+ "epoch": 8.880165289256198,
17269
+ "grad_norm": 0.08442248404026031,
17270
+ "learning_rate": 3.389467622217524e-07,
17271
+ "loss": 0.1655,
17272
+ "mean_token_accuracy": 0.9483348727226257,
17273
+ "step": 2149
17274
+ },
17275
+ {
17276
+ "epoch": 8.884297520661157,
17277
+ "grad_norm": 0.08726052194833755,
17278
+ "learning_rate": 3.3647831547590714e-07,
17279
+ "loss": 0.171,
17280
+ "mean_token_accuracy": 0.9444353580474854,
17281
+ "step": 2150
17282
+ },
17283
+ {
17284
+ "epoch": 8.888429752066116,
17285
+ "grad_norm": 0.0831826776266098,
17286
+ "learning_rate": 3.340185770772586e-07,
17287
+ "loss": 0.1653,
17288
+ "mean_token_accuracy": 0.9473860859870911,
17289
+ "step": 2151
17290
+ },
17291
+ {
17292
+ "epoch": 8.892561983471074,
17293
+ "grad_norm": 0.07340402901172638,
17294
+ "learning_rate": 3.3156755161895647e-07,
17295
+ "loss": 0.126,
17296
+ "mean_token_accuracy": 0.9628297090530396,
17297
+ "step": 2152
17298
+ },
17299
+ {
17300
+ "epoch": 8.896694214876034,
17301
+ "grad_norm": 0.08021709322929382,
17302
+ "learning_rate": 3.2912524367788077e-07,
17303
+ "loss": 0.1286,
17304
+ "mean_token_accuracy": 0.9604715704917908,
17305
+ "step": 2153
17306
+ },
17307
+ {
17308
+ "epoch": 8.900826446280991,
17309
+ "grad_norm": 0.09082391858100891,
17310
+ "learning_rate": 3.26691657814634e-07,
17311
+ "loss": 0.1633,
17312
+ "mean_token_accuracy": 0.9478123188018799,
17313
+ "step": 2154
17314
+ },
17315
+ {
17316
+ "epoch": 8.90495867768595,
17317
+ "grad_norm": 0.07327866554260254,
17318
+ "learning_rate": 3.2426679857353205e-07,
17319
+ "loss": 0.1074,
17320
+ "mean_token_accuracy": 0.9684982895851135,
17321
+ "step": 2155
17322
+ },
17323
+ {
17324
+ "epoch": 8.909090909090908,
17325
+ "grad_norm": 0.10244060307741165,
17326
+ "learning_rate": 3.2185067048259245e-07,
17327
+ "loss": 0.2105,
17328
+ "mean_token_accuracy": 0.9306122660636902,
17329
+ "step": 2156
17330
+ },
17331
+ {
17332
+ "epoch": 8.913223140495868,
17333
+ "grad_norm": 0.10891727358102798,
17334
+ "learning_rate": 3.194432780535295e-07,
17335
+ "loss": 0.1737,
17336
+ "mean_token_accuracy": 0.9447806477546692,
17337
+ "step": 2157
17338
+ },
17339
+ {
17340
+ "epoch": 8.917355371900827,
17341
+ "grad_norm": 0.08316652476787567,
17342
+ "learning_rate": 3.1704462578174945e-07,
17343
+ "loss": 0.0987,
17344
+ "mean_token_accuracy": 0.9734411239624023,
17345
+ "step": 2158
17346
+ },
17347
+ {
17348
+ "epoch": 8.921487603305785,
17349
+ "grad_norm": 0.09768752753734589,
17350
+ "learning_rate": 3.146547181463322e-07,
17351
+ "loss": 0.1241,
17352
+ "mean_token_accuracy": 0.9615846276283264,
17353
+ "step": 2159
17354
+ },
17355
+ {
17356
+ "epoch": 8.925619834710744,
17357
+ "grad_norm": 0.09764862805604935,
17358
+ "learning_rate": 3.1227355961003183e-07,
17359
+ "loss": 0.1175,
17360
+ "mean_token_accuracy": 0.962368905544281,
17361
+ "step": 2160
17362
+ },
17363
+ {
17364
+ "epoch": 8.929752066115702,
17365
+ "grad_norm": 0.09442981332540512,
17366
+ "learning_rate": 3.099011546192621e-07,
17367
+ "loss": 0.1053,
17368
+ "mean_token_accuracy": 0.9669612646102905,
17369
+ "step": 2161
17370
+ },
17371
+ {
17372
+ "epoch": 8.933884297520661,
17373
+ "grad_norm": 0.08316774666309357,
17374
+ "learning_rate": 3.075375076040943e-07,
17375
+ "loss": 0.092,
17376
+ "mean_token_accuracy": 0.971761167049408,
17377
+ "step": 2162
17378
+ },
17379
+ {
17380
+ "epoch": 8.938016528925619,
17381
+ "grad_norm": 0.09355759620666504,
17382
+ "learning_rate": 3.051826229782451e-07,
17383
+ "loss": 0.1264,
17384
+ "mean_token_accuracy": 0.9610360860824585,
17385
+ "step": 2163
17386
+ },
17387
+ {
17388
+ "epoch": 8.942148760330578,
17389
+ "grad_norm": 0.08833235502243042,
17390
+ "learning_rate": 3.0283650513906524e-07,
17391
+ "loss": 0.0889,
17392
+ "mean_token_accuracy": 0.9773091673851013,
17393
+ "step": 2164
17394
+ },
17395
+ {
17396
+ "epoch": 8.946280991735538,
17397
+ "grad_norm": 0.1082148551940918,
17398
+ "learning_rate": 3.0049915846753983e-07,
17399
+ "loss": 0.0873,
17400
+ "mean_token_accuracy": 0.9752772450447083,
17401
+ "step": 2165
17402
+ },
17403
+ {
17404
+ "epoch": 8.950413223140496,
17405
+ "grad_norm": 0.09726337343454361,
17406
+ "learning_rate": 2.981705873282714e-07,
17407
+ "loss": 0.0871,
17408
+ "mean_token_accuracy": 0.97633957862854,
17409
+ "step": 2166
17410
+ },
17411
+ {
17412
+ "epoch": 8.954545454545455,
17413
+ "grad_norm": 0.09049960970878601,
17414
+ "learning_rate": 2.9585079606947843e-07,
17415
+ "loss": 0.1021,
17416
+ "mean_token_accuracy": 0.9693925380706787,
17417
+ "step": 2167
17418
+ },
17419
+ {
17420
+ "epoch": 8.958677685950413,
17421
+ "grad_norm": 0.10062138736248016,
17422
+ "learning_rate": 2.9353978902298296e-07,
17423
+ "loss": 0.1027,
17424
+ "mean_token_accuracy": 0.9701105952262878,
17425
+ "step": 2168
17426
+ },
17427
+ {
17428
+ "epoch": 8.962809917355372,
17429
+ "grad_norm": 0.09967434406280518,
17430
+ "learning_rate": 2.9123757050420476e-07,
17431
+ "loss": 0.106,
17432
+ "mean_token_accuracy": 0.9688540101051331,
17433
+ "step": 2169
17434
+ },
17435
+ {
17436
+ "epoch": 8.96694214876033,
17437
+ "grad_norm": 0.08865738660097122,
17438
+ "learning_rate": 2.889441448121516e-07,
17439
+ "loss": 0.1013,
17440
+ "mean_token_accuracy": 0.970402717590332,
17441
+ "step": 2170
17442
+ },
17443
+ {
17444
+ "epoch": 8.97107438016529,
17445
+ "grad_norm": 0.11323466897010803,
17446
+ "learning_rate": 2.8665951622941225e-07,
17447
+ "loss": 0.138,
17448
+ "mean_token_accuracy": 0.9595091938972473,
17449
+ "step": 2171
17450
+ },
17451
+ {
17452
+ "epoch": 8.975206611570249,
17453
+ "grad_norm": 0.08644384145736694,
17454
+ "learning_rate": 2.843836890221502e-07,
17455
+ "loss": 0.0865,
17456
+ "mean_token_accuracy": 0.9795562028884888,
17457
+ "step": 2172
17458
+ },
17459
+ {
17460
+ "epoch": 8.979338842975206,
17461
+ "grad_norm": 0.10215216130018234,
17462
+ "learning_rate": 2.821166674400905e-07,
17463
+ "loss": 0.1145,
17464
+ "mean_token_accuracy": 0.9649077653884888,
17465
+ "step": 2173
17466
+ },
17467
+ {
17468
+ "epoch": 8.983471074380166,
17469
+ "grad_norm": 0.11719004064798355,
17470
+ "learning_rate": 2.798584557165185e-07,
17471
+ "loss": 0.1057,
17472
+ "mean_token_accuracy": 0.969704806804657,
17473
+ "step": 2174
17474
+ },
17475
+ {
17476
+ "epoch": 8.987603305785123,
17477
+ "grad_norm": 0.10035425424575806,
17478
+ "learning_rate": 2.7760905806826554e-07,
17479
+ "loss": 0.095,
17480
+ "mean_token_accuracy": 0.9760020971298218,
17481
+ "step": 2175
17482
+ },
17483
+ {
17484
+ "epoch": 8.991735537190083,
17485
+ "grad_norm": 0.12130289524793625,
17486
+ "learning_rate": 2.753684786957067e-07,
17487
+ "loss": 0.1554,
17488
+ "mean_token_accuracy": 0.951777994632721,
17489
+ "step": 2176
17490
+ },
17491
+ {
17492
+ "epoch": 8.99586776859504,
17493
+ "grad_norm": 0.0855918899178505,
17494
+ "learning_rate": 2.7313672178274906e-07,
17495
+ "loss": 0.0821,
17496
+ "mean_token_accuracy": 0.9829513430595398,
17497
+ "step": 2177
17498
+ },
17499
+ {
17500
+ "epoch": 9.0,
17501
+ "grad_norm": 0.12811078131198883,
17502
+ "learning_rate": 2.7091379149682683e-07,
17503
+ "loss": 0.1026,
17504
+ "mean_token_accuracy": 0.973548173904419,
17505
+ "step": 2178
17506
+ },
17507
+ {
17508
+ "epoch": 9.0,
17509
+ "eval_loss": 0.1644172966480255,
17510
+ "eval_mean_token_accuracy": 0.9770992398262024,
17511
+ "eval_runtime": 0.2104,
17512
+ "eval_samples_per_second": 23.768,
17513
+ "eval_steps_per_second": 4.754,
17514
+ "step": 2178
17515
  }
17516
  ],
17517
  "logging_steps": 1,
 
17531
  "attributes": {}
17532
  }
17533
  },
17534
+ "total_flos": 1.0074075399098204e+18,
17535
  "train_batch_size": 2,
17536
  "trial_name": null,
17537
  "trial_params": null