romainnn commited on
Commit
6b9a829
·
verified ·
1 Parent(s): d09faf8

Training in progress, step 600, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8386a050c5a0d6ac30c29464246237b5f77f6acfd82b1279bfad77b0338f6639
3
  size 159967880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61fa39fe72d3b07877b75fe39479b59104182f209a2615a3e426dd500c69d610
3
  size 159967880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e11b8e0c7e6ef4ebe8274ebcc4f99efbb33e30279c8935ed93a8c6d481a124af
3
  size 81730644
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9b75154a8ac4a6cfadd72600ba7b4722dee87df5c3d0af9be47b403fa2dcf82
3
  size 81730644
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3b7047cccb255e2e5a046a928df43a7a12de98b75b435e3b319fb412f41b97b6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d6b5d9c6f0c15be87f5391bf4c24c5abc24693b4a4ae1f1e316a59c22584e8b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:343f8f4bfa74d3f8aa3d5cc6579e91dcd37c40481c5d048478a6e3e523bb44f8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3fee25a74f8614e14501bbc1e4dbf79c416bb01f3b851bb0f5f52f090982cd43
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
- "best_metric": 0.7969969511032104,
3
- "best_model_checkpoint": "miner_id_24/checkpoint-500",
4
- "epoch": 0.03482500435312554,
5
  "eval_steps": 100,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3555,6 +3555,714 @@
3555
  "eval_samples_per_second": 7.14,
3556
  "eval_steps_per_second": 1.785,
3557
  "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3558
  }
3559
  ],
3560
  "logging_steps": 1,
@@ -3583,7 +4291,7 @@
3583
  "attributes": {}
3584
  }
3585
  },
3586
- "total_flos": 1.359663190769664e+18,
3587
  "train_batch_size": 4,
3588
  "trial_name": null,
3589
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.7635987401008606,
3
+ "best_model_checkpoint": "miner_id_24/checkpoint-600",
4
+ "epoch": 0.041790005223750655,
5
  "eval_steps": 100,
6
+ "global_step": 600,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3555
  "eval_samples_per_second": 7.14,
3556
  "eval_steps_per_second": 1.785,
3557
  "step": 500
3558
+ },
3559
+ {
3560
+ "epoch": 0.0348946543618318,
3561
+ "grad_norm": 0.5795607566833496,
3562
+ "learning_rate": 9.572925909767412e-05,
3563
+ "loss": 0.4495,
3564
+ "step": 501
3565
+ },
3566
+ {
3567
+ "epoch": 0.034964304370538046,
3568
+ "grad_norm": 0.6874101161956787,
3569
+ "learning_rate": 9.540096409419296e-05,
3570
+ "loss": 0.8444,
3571
+ "step": 502
3572
+ },
3573
+ {
3574
+ "epoch": 0.035033954379244295,
3575
+ "grad_norm": 0.5595911145210266,
3576
+ "learning_rate": 9.507271875570381e-05,
3577
+ "loss": 0.9391,
3578
+ "step": 503
3579
+ },
3580
+ {
3581
+ "epoch": 0.03510360438795055,
3582
+ "grad_norm": 0.525644063949585,
3583
+ "learning_rate": 9.474452662692838e-05,
3584
+ "loss": 0.7833,
3585
+ "step": 504
3586
+ },
3587
+ {
3588
+ "epoch": 0.0351732543966568,
3589
+ "grad_norm": 0.6366891264915466,
3590
+ "learning_rate": 9.441639125201368e-05,
3591
+ "loss": 1.0472,
3592
+ "step": 505
3593
+ },
3594
+ {
3595
+ "epoch": 0.035242904405363054,
3596
+ "grad_norm": 0.8487269878387451,
3597
+ "learning_rate": 9.408831617449385e-05,
3598
+ "loss": 1.0513,
3599
+ "step": 506
3600
+ },
3601
+ {
3602
+ "epoch": 0.0353125544140693,
3603
+ "grad_norm": 0.7027648091316223,
3604
+ "learning_rate": 9.376030493725189e-05,
3605
+ "loss": 0.9505,
3606
+ "step": 507
3607
+ },
3608
+ {
3609
+ "epoch": 0.03538220442277555,
3610
+ "grad_norm": 0.6772575974464417,
3611
+ "learning_rate": 9.343236108248139e-05,
3612
+ "loss": 1.0417,
3613
+ "step": 508
3614
+ },
3615
+ {
3616
+ "epoch": 0.03545185443148181,
3617
+ "grad_norm": 0.5657368898391724,
3618
+ "learning_rate": 9.310448815164826e-05,
3619
+ "loss": 0.9236,
3620
+ "step": 509
3621
+ },
3622
+ {
3623
+ "epoch": 0.035521504440188055,
3624
+ "grad_norm": 0.64215087890625,
3625
+ "learning_rate": 9.277668968545253e-05,
3626
+ "loss": 1.0035,
3627
+ "step": 510
3628
+ },
3629
+ {
3630
+ "epoch": 0.035591154448894304,
3631
+ "grad_norm": 0.6276829242706299,
3632
+ "learning_rate": 9.244896922379007e-05,
3633
+ "loss": 0.8375,
3634
+ "step": 511
3635
+ },
3636
+ {
3637
+ "epoch": 0.03566080445760056,
3638
+ "grad_norm": 0.5804170966148376,
3639
+ "learning_rate": 9.212133030571437e-05,
3640
+ "loss": 0.4934,
3641
+ "step": 512
3642
+ },
3643
+ {
3644
+ "epoch": 0.03573045446630681,
3645
+ "grad_norm": 0.7230868935585022,
3646
+ "learning_rate": 9.17937764693983e-05,
3647
+ "loss": 0.9427,
3648
+ "step": 513
3649
+ },
3650
+ {
3651
+ "epoch": 0.035800104475013056,
3652
+ "grad_norm": 0.6632394194602966,
3653
+ "learning_rate": 9.146631125209607e-05,
3654
+ "loss": 0.4176,
3655
+ "step": 514
3656
+ },
3657
+ {
3658
+ "epoch": 0.03586975448371931,
3659
+ "grad_norm": 0.5885234475135803,
3660
+ "learning_rate": 9.113893819010475e-05,
3661
+ "loss": 0.6042,
3662
+ "step": 515
3663
+ },
3664
+ {
3665
+ "epoch": 0.03593940449242556,
3666
+ "grad_norm": 0.5666863322257996,
3667
+ "learning_rate": 9.081166081872626e-05,
3668
+ "loss": 1.5152,
3669
+ "step": 516
3670
+ },
3671
+ {
3672
+ "epoch": 0.036009054501131815,
3673
+ "grad_norm": 0.7007538676261902,
3674
+ "learning_rate": 9.048448267222918e-05,
3675
+ "loss": 0.9444,
3676
+ "step": 517
3677
+ },
3678
+ {
3679
+ "epoch": 0.036078704509838064,
3680
+ "grad_norm": 0.6212923526763916,
3681
+ "learning_rate": 9.015740728381054e-05,
3682
+ "loss": 0.634,
3683
+ "step": 518
3684
+ },
3685
+ {
3686
+ "epoch": 0.03614835451854431,
3687
+ "grad_norm": 0.6189596056938171,
3688
+ "learning_rate": 8.98304381855577e-05,
3689
+ "loss": 1.1091,
3690
+ "step": 519
3691
+ },
3692
+ {
3693
+ "epoch": 0.03621800452725057,
3694
+ "grad_norm": 0.6159670948982239,
3695
+ "learning_rate": 8.95035789084102e-05,
3696
+ "loss": 0.787,
3697
+ "step": 520
3698
+ },
3699
+ {
3700
+ "epoch": 0.036287654535956816,
3701
+ "grad_norm": 0.6371515989303589,
3702
+ "learning_rate": 8.917683298212158e-05,
3703
+ "loss": 0.6172,
3704
+ "step": 521
3705
+ },
3706
+ {
3707
+ "epoch": 0.036357304544663065,
3708
+ "grad_norm": 0.6314066052436829,
3709
+ "learning_rate": 8.885020393522135e-05,
3710
+ "loss": 0.9702,
3711
+ "step": 522
3712
+ },
3713
+ {
3714
+ "epoch": 0.03642695455336932,
3715
+ "grad_norm": 0.6285626888275146,
3716
+ "learning_rate": 8.852369529497679e-05,
3717
+ "loss": 0.9819,
3718
+ "step": 523
3719
+ },
3720
+ {
3721
+ "epoch": 0.03649660456207557,
3722
+ "grad_norm": 0.5257949233055115,
3723
+ "learning_rate": 8.819731058735501e-05,
3724
+ "loss": 0.8288,
3725
+ "step": 524
3726
+ },
3727
+ {
3728
+ "epoch": 0.036566254570781824,
3729
+ "grad_norm": 0.611438512802124,
3730
+ "learning_rate": 8.787105333698465e-05,
3731
+ "loss": 0.9246,
3732
+ "step": 525
3733
+ },
3734
+ {
3735
+ "epoch": 0.03663590457948807,
3736
+ "grad_norm": 0.5995710492134094,
3737
+ "learning_rate": 8.754492706711798e-05,
3738
+ "loss": 0.6855,
3739
+ "step": 526
3740
+ },
3741
+ {
3742
+ "epoch": 0.03670555458819432,
3743
+ "grad_norm": 0.681425154209137,
3744
+ "learning_rate": 8.721893529959287e-05,
3745
+ "loss": 1.1644,
3746
+ "step": 527
3747
+ },
3748
+ {
3749
+ "epoch": 0.036775204596900576,
3750
+ "grad_norm": 0.7111718654632568,
3751
+ "learning_rate": 8.68930815547946e-05,
3752
+ "loss": 0.9181,
3753
+ "step": 528
3754
+ },
3755
+ {
3756
+ "epoch": 0.036844854605606825,
3757
+ "grad_norm": 0.5794047713279724,
3758
+ "learning_rate": 8.656736935161802e-05,
3759
+ "loss": 1.061,
3760
+ "step": 529
3761
+ },
3762
+ {
3763
+ "epoch": 0.03691450461431307,
3764
+ "grad_norm": 0.5971503257751465,
3765
+ "learning_rate": 8.624180220742946e-05,
3766
+ "loss": 0.5903,
3767
+ "step": 530
3768
+ },
3769
+ {
3770
+ "epoch": 0.03698415462301933,
3771
+ "grad_norm": 0.7091482281684875,
3772
+ "learning_rate": 8.59163836380287e-05,
3773
+ "loss": 0.8907,
3774
+ "step": 531
3775
+ },
3776
+ {
3777
+ "epoch": 0.03705380463172558,
3778
+ "grad_norm": 0.6185580492019653,
3779
+ "learning_rate": 8.559111715761114e-05,
3780
+ "loss": 0.8452,
3781
+ "step": 532
3782
+ },
3783
+ {
3784
+ "epoch": 0.03712345464043183,
3785
+ "grad_norm": 0.68827223777771,
3786
+ "learning_rate": 8.52660062787297e-05,
3787
+ "loss": 0.8711,
3788
+ "step": 533
3789
+ },
3790
+ {
3791
+ "epoch": 0.03719310464913808,
3792
+ "grad_norm": 0.6279632449150085,
3793
+ "learning_rate": 8.494105451225704e-05,
3794
+ "loss": 0.6453,
3795
+ "step": 534
3796
+ },
3797
+ {
3798
+ "epoch": 0.03726275465784433,
3799
+ "grad_norm": 0.7252237200737,
3800
+ "learning_rate": 8.461626536734753e-05,
3801
+ "loss": 1.1148,
3802
+ "step": 535
3803
+ },
3804
+ {
3805
+ "epoch": 0.037332404666550585,
3806
+ "grad_norm": 0.6377342939376831,
3807
+ "learning_rate": 8.429164235139931e-05,
3808
+ "loss": 1.0532,
3809
+ "step": 536
3810
+ },
3811
+ {
3812
+ "epoch": 0.037402054675256834,
3813
+ "grad_norm": 0.7409278154373169,
3814
+ "learning_rate": 8.396718897001663e-05,
3815
+ "loss": 1.0161,
3816
+ "step": 537
3817
+ },
3818
+ {
3819
+ "epoch": 0.03747170468396308,
3820
+ "grad_norm": 0.6048555970191956,
3821
+ "learning_rate": 8.364290872697173e-05,
3822
+ "loss": 1.012,
3823
+ "step": 538
3824
+ },
3825
+ {
3826
+ "epoch": 0.03754135469266934,
3827
+ "grad_norm": 0.7676815390586853,
3828
+ "learning_rate": 8.331880512416724e-05,
3829
+ "loss": 0.9402,
3830
+ "step": 539
3831
+ },
3832
+ {
3833
+ "epoch": 0.037611004701375586,
3834
+ "grad_norm": 0.6360906958580017,
3835
+ "learning_rate": 8.299488166159817e-05,
3836
+ "loss": 0.4591,
3837
+ "step": 540
3838
+ },
3839
+ {
3840
+ "epoch": 0.03768065471008184,
3841
+ "grad_norm": 0.6816183924674988,
3842
+ "learning_rate": 8.267114183731421e-05,
3843
+ "loss": 0.661,
3844
+ "step": 541
3845
+ },
3846
+ {
3847
+ "epoch": 0.03775030471878809,
3848
+ "grad_norm": 0.6955873966217041,
3849
+ "learning_rate": 8.234758914738199e-05,
3850
+ "loss": 0.8015,
3851
+ "step": 542
3852
+ },
3853
+ {
3854
+ "epoch": 0.03781995472749434,
3855
+ "grad_norm": 0.787493884563446,
3856
+ "learning_rate": 8.20242270858472e-05,
3857
+ "loss": 0.6941,
3858
+ "step": 543
3859
+ },
3860
+ {
3861
+ "epoch": 0.037889604736200594,
3862
+ "grad_norm": 0.5939062833786011,
3863
+ "learning_rate": 8.170105914469702e-05,
3864
+ "loss": 0.9034,
3865
+ "step": 544
3866
+ },
3867
+ {
3868
+ "epoch": 0.03795925474490684,
3869
+ "grad_norm": 0.5235042572021484,
3870
+ "learning_rate": 8.137808881382226e-05,
3871
+ "loss": 1.0283,
3872
+ "step": 545
3873
+ },
3874
+ {
3875
+ "epoch": 0.03802890475361309,
3876
+ "grad_norm": 0.7017082571983337,
3877
+ "learning_rate": 8.105531958097972e-05,
3878
+ "loss": 1.0407,
3879
+ "step": 546
3880
+ },
3881
+ {
3882
+ "epoch": 0.038098554762319346,
3883
+ "grad_norm": 0.7762130498886108,
3884
+ "learning_rate": 8.073275493175464e-05,
3885
+ "loss": 0.7814,
3886
+ "step": 547
3887
+ },
3888
+ {
3889
+ "epoch": 0.038168204771025595,
3890
+ "grad_norm": 0.588405191898346,
3891
+ "learning_rate": 8.041039834952287e-05,
3892
+ "loss": 0.8832,
3893
+ "step": 548
3894
+ },
3895
+ {
3896
+ "epoch": 0.03823785477973185,
3897
+ "grad_norm": 0.7792285084724426,
3898
+ "learning_rate": 8.008825331541335e-05,
3899
+ "loss": 1.051,
3900
+ "step": 549
3901
+ },
3902
+ {
3903
+ "epoch": 0.0383075047884381,
3904
+ "grad_norm": 0.6209467649459839,
3905
+ "learning_rate": 7.976632330827056e-05,
3906
+ "loss": 0.8802,
3907
+ "step": 550
3908
+ },
3909
+ {
3910
+ "epoch": 0.03837715479714435,
3911
+ "grad_norm": 0.5231680274009705,
3912
+ "learning_rate": 7.944461180461686e-05,
3913
+ "loss": 0.7529,
3914
+ "step": 551
3915
+ },
3916
+ {
3917
+ "epoch": 0.0384468048058506,
3918
+ "grad_norm": 0.6021607518196106,
3919
+ "learning_rate": 7.912312227861503e-05,
3920
+ "loss": 1.1235,
3921
+ "step": 552
3922
+ },
3923
+ {
3924
+ "epoch": 0.03851645481455685,
3925
+ "grad_norm": 0.5573668479919434,
3926
+ "learning_rate": 7.880185820203065e-05,
3927
+ "loss": 0.6753,
3928
+ "step": 553
3929
+ },
3930
+ {
3931
+ "epoch": 0.0385861048232631,
3932
+ "grad_norm": 0.5354910492897034,
3933
+ "learning_rate": 7.848082304419478e-05,
3934
+ "loss": 0.6843,
3935
+ "step": 554
3936
+ },
3937
+ {
3938
+ "epoch": 0.038655754831969355,
3939
+ "grad_norm": 0.606436014175415,
3940
+ "learning_rate": 7.816002027196627e-05,
3941
+ "loss": 1.0557,
3942
+ "step": 555
3943
+ },
3944
+ {
3945
+ "epoch": 0.038725404840675604,
3946
+ "grad_norm": 0.6580552458763123,
3947
+ "learning_rate": 7.783945334969451e-05,
3948
+ "loss": 0.6222,
3949
+ "step": 556
3950
+ },
3951
+ {
3952
+ "epoch": 0.03879505484938186,
3953
+ "grad_norm": 0.6174128651618958,
3954
+ "learning_rate": 7.751912573918193e-05,
3955
+ "loss": 0.8194,
3956
+ "step": 557
3957
+ },
3958
+ {
3959
+ "epoch": 0.03886470485808811,
3960
+ "grad_norm": 0.6724019646644592,
3961
+ "learning_rate": 7.719904089964658e-05,
3962
+ "loss": 1.0095,
3963
+ "step": 558
3964
+ },
3965
+ {
3966
+ "epoch": 0.038934354866794356,
3967
+ "grad_norm": 0.7200993299484253,
3968
+ "learning_rate": 7.687920228768493e-05,
3969
+ "loss": 0.8115,
3970
+ "step": 559
3971
+ },
3972
+ {
3973
+ "epoch": 0.03900400487550061,
3974
+ "grad_norm": 0.5682472586631775,
3975
+ "learning_rate": 7.655961335723433e-05,
3976
+ "loss": 0.7034,
3977
+ "step": 560
3978
+ },
3979
+ {
3980
+ "epoch": 0.03907365488420686,
3981
+ "grad_norm": 0.7236086130142212,
3982
+ "learning_rate": 7.624027755953592e-05,
3983
+ "loss": 0.9028,
3984
+ "step": 561
3985
+ },
3986
+ {
3987
+ "epoch": 0.03914330489291311,
3988
+ "grad_norm": 0.5866789221763611,
3989
+ "learning_rate": 7.592119834309715e-05,
3990
+ "loss": 0.8919,
3991
+ "step": 562
3992
+ },
3993
+ {
3994
+ "epoch": 0.039212954901619364,
3995
+ "grad_norm": 0.6271937489509583,
3996
+ "learning_rate": 7.560237915365472e-05,
3997
+ "loss": 0.6447,
3998
+ "step": 563
3999
+ },
4000
+ {
4001
+ "epoch": 0.03928260491032561,
4002
+ "grad_norm": 0.5319473147392273,
4003
+ "learning_rate": 7.528382343413734e-05,
4004
+ "loss": 1.0977,
4005
+ "step": 564
4006
+ },
4007
+ {
4008
+ "epoch": 0.03935225491903187,
4009
+ "grad_norm": 0.673537015914917,
4010
+ "learning_rate": 7.49655346246284e-05,
4011
+ "loss": 0.6669,
4012
+ "step": 565
4013
+ },
4014
+ {
4015
+ "epoch": 0.039421904927738116,
4016
+ "grad_norm": 0.7043957114219666,
4017
+ "learning_rate": 7.464751616232902e-05,
4018
+ "loss": 0.6334,
4019
+ "step": 566
4020
+ },
4021
+ {
4022
+ "epoch": 0.039491554936444365,
4023
+ "grad_norm": 0.6532731652259827,
4024
+ "learning_rate": 7.432977148152074e-05,
4025
+ "loss": 0.659,
4026
+ "step": 567
4027
+ },
4028
+ {
4029
+ "epoch": 0.03956120494515062,
4030
+ "grad_norm": 0.6882482767105103,
4031
+ "learning_rate": 7.401230401352866e-05,
4032
+ "loss": 0.711,
4033
+ "step": 568
4034
+ },
4035
+ {
4036
+ "epoch": 0.03963085495385687,
4037
+ "grad_norm": 0.7171745896339417,
4038
+ "learning_rate": 7.369511718668418e-05,
4039
+ "loss": 0.941,
4040
+ "step": 569
4041
+ },
4042
+ {
4043
+ "epoch": 0.03970050496256312,
4044
+ "grad_norm": 0.6474679708480835,
4045
+ "learning_rate": 7.337821442628805e-05,
4046
+ "loss": 0.8192,
4047
+ "step": 570
4048
+ },
4049
+ {
4050
+ "epoch": 0.03977015497126937,
4051
+ "grad_norm": 0.7054280042648315,
4052
+ "learning_rate": 7.306159915457342e-05,
4053
+ "loss": 0.6327,
4054
+ "step": 571
4055
+ },
4056
+ {
4057
+ "epoch": 0.03983980497997562,
4058
+ "grad_norm": 0.7624709606170654,
4059
+ "learning_rate": 7.274527479066883e-05,
4060
+ "loss": 0.8132,
4061
+ "step": 572
4062
+ },
4063
+ {
4064
+ "epoch": 0.039909454988681876,
4065
+ "grad_norm": 0.6930527687072754,
4066
+ "learning_rate": 7.242924475056127e-05,
4067
+ "loss": 0.8482,
4068
+ "step": 573
4069
+ },
4070
+ {
4071
+ "epoch": 0.039979104997388125,
4072
+ "grad_norm": 0.6599513292312622,
4073
+ "learning_rate": 7.211351244705946e-05,
4074
+ "loss": 0.6787,
4075
+ "step": 574
4076
+ },
4077
+ {
4078
+ "epoch": 0.04004875500609437,
4079
+ "grad_norm": 0.7311400771141052,
4080
+ "learning_rate": 7.179808128975674e-05,
4081
+ "loss": 0.9747,
4082
+ "step": 575
4083
+ },
4084
+ {
4085
+ "epoch": 0.04011840501480063,
4086
+ "grad_norm": 0.615138828754425,
4087
+ "learning_rate": 7.148295468499438e-05,
4088
+ "loss": 0.9404,
4089
+ "step": 576
4090
+ },
4091
+ {
4092
+ "epoch": 0.04018805502350688,
4093
+ "grad_norm": 0.6401761174201965,
4094
+ "learning_rate": 7.116813603582482e-05,
4095
+ "loss": 0.4915,
4096
+ "step": 577
4097
+ },
4098
+ {
4099
+ "epoch": 0.040257705032213126,
4100
+ "grad_norm": 0.6191440224647522,
4101
+ "learning_rate": 7.08536287419749e-05,
4102
+ "loss": 0.6031,
4103
+ "step": 578
4104
+ },
4105
+ {
4106
+ "epoch": 0.04032735504091938,
4107
+ "grad_norm": 0.5751050710678101,
4108
+ "learning_rate": 7.053943619980907e-05,
4109
+ "loss": 0.8371,
4110
+ "step": 579
4111
+ },
4112
+ {
4113
+ "epoch": 0.04039700504962563,
4114
+ "grad_norm": 0.518409252166748,
4115
+ "learning_rate": 7.022556180229285e-05,
4116
+ "loss": 0.4333,
4117
+ "step": 580
4118
+ },
4119
+ {
4120
+ "epoch": 0.040466655058331885,
4121
+ "grad_norm": 0.5712803602218628,
4122
+ "learning_rate": 6.991200893895608e-05,
4123
+ "loss": 0.796,
4124
+ "step": 581
4125
+ },
4126
+ {
4127
+ "epoch": 0.040536305067038134,
4128
+ "grad_norm": 0.661482036113739,
4129
+ "learning_rate": 6.959878099585635e-05,
4130
+ "loss": 0.8585,
4131
+ "step": 582
4132
+ },
4133
+ {
4134
+ "epoch": 0.04060595507574438,
4135
+ "grad_norm": 0.6602011322975159,
4136
+ "learning_rate": 6.92858813555424e-05,
4137
+ "loss": 0.9474,
4138
+ "step": 583
4139
+ },
4140
+ {
4141
+ "epoch": 0.04067560508445064,
4142
+ "grad_norm": 0.5971815586090088,
4143
+ "learning_rate": 6.897331339701776e-05,
4144
+ "loss": 0.7689,
4145
+ "step": 584
4146
+ },
4147
+ {
4148
+ "epoch": 0.040745255093156886,
4149
+ "grad_norm": 0.571740448474884,
4150
+ "learning_rate": 6.866108049570397e-05,
4151
+ "loss": 0.9023,
4152
+ "step": 585
4153
+ },
4154
+ {
4155
+ "epoch": 0.040814905101863135,
4156
+ "grad_norm": 0.6928638219833374,
4157
+ "learning_rate": 6.834918602340438e-05,
4158
+ "loss": 0.8899,
4159
+ "step": 586
4160
+ },
4161
+ {
4162
+ "epoch": 0.04088455511056939,
4163
+ "grad_norm": 0.6468199491500854,
4164
+ "learning_rate": 6.803763334826763e-05,
4165
+ "loss": 0.8841,
4166
+ "step": 587
4167
+ },
4168
+ {
4169
+ "epoch": 0.04095420511927564,
4170
+ "grad_norm": 0.6777251362800598,
4171
+ "learning_rate": 6.772642583475126e-05,
4172
+ "loss": 0.8491,
4173
+ "step": 588
4174
+ },
4175
+ {
4176
+ "epoch": 0.041023855127981894,
4177
+ "grad_norm": 0.5866687297821045,
4178
+ "learning_rate": 6.741556684358545e-05,
4179
+ "loss": 0.6435,
4180
+ "step": 589
4181
+ },
4182
+ {
4183
+ "epoch": 0.04109350513668814,
4184
+ "grad_norm": 0.5522730350494385,
4185
+ "learning_rate": 6.710505973173664e-05,
4186
+ "loss": 0.9188,
4187
+ "step": 590
4188
+ },
4189
+ {
4190
+ "epoch": 0.04116315514539439,
4191
+ "grad_norm": 0.7048250436782837,
4192
+ "learning_rate": 6.679490785237137e-05,
4193
+ "loss": 0.911,
4194
+ "step": 591
4195
+ },
4196
+ {
4197
+ "epoch": 0.041232805154100646,
4198
+ "grad_norm": 0.849677324295044,
4199
+ "learning_rate": 6.648511455482003e-05,
4200
+ "loss": 1.0408,
4201
+ "step": 592
4202
+ },
4203
+ {
4204
+ "epoch": 0.041302455162806895,
4205
+ "grad_norm": 0.653287947177887,
4206
+ "learning_rate": 6.617568318454059e-05,
4207
+ "loss": 1.187,
4208
+ "step": 593
4209
+ },
4210
+ {
4211
+ "epoch": 0.04137210517151314,
4212
+ "grad_norm": 0.5278560519218445,
4213
+ "learning_rate": 6.586661708308272e-05,
4214
+ "loss": 0.8789,
4215
+ "step": 594
4216
+ },
4217
+ {
4218
+ "epoch": 0.0414417551802194,
4219
+ "grad_norm": 0.7803817987442017,
4220
+ "learning_rate": 6.555791958805147e-05,
4221
+ "loss": 0.8788,
4222
+ "step": 595
4223
+ },
4224
+ {
4225
+ "epoch": 0.04151140518892565,
4226
+ "grad_norm": 0.6425774097442627,
4227
+ "learning_rate": 6.524959403307125e-05,
4228
+ "loss": 0.9296,
4229
+ "step": 596
4230
+ },
4231
+ {
4232
+ "epoch": 0.0415810551976319,
4233
+ "grad_norm": 0.5787883400917053,
4234
+ "learning_rate": 6.494164374775e-05,
4235
+ "loss": 1.0127,
4236
+ "step": 597
4237
+ },
4238
+ {
4239
+ "epoch": 0.04165070520633815,
4240
+ "grad_norm": 0.5686517357826233,
4241
+ "learning_rate": 6.463407205764305e-05,
4242
+ "loss": 0.7869,
4243
+ "step": 598
4244
+ },
4245
+ {
4246
+ "epoch": 0.0417203552150444,
4247
+ "grad_norm": 0.5126462578773499,
4248
+ "learning_rate": 6.43268822842173e-05,
4249
+ "loss": 1.2029,
4250
+ "step": 599
4251
+ },
4252
+ {
4253
+ "epoch": 0.041790005223750655,
4254
+ "grad_norm": 0.5618976950645447,
4255
+ "learning_rate": 6.402007774481536e-05,
4256
+ "loss": 0.5725,
4257
+ "step": 600
4258
+ },
4259
+ {
4260
+ "epoch": 0.041790005223750655,
4261
+ "eval_loss": 0.7635987401008606,
4262
+ "eval_runtime": 701.6781,
4263
+ "eval_samples_per_second": 7.126,
4264
+ "eval_steps_per_second": 1.781,
4265
+ "step": 600
4266
  }
4267
  ],
4268
  "logging_steps": 1,
 
4291
  "attributes": {}
4292
  }
4293
  },
4294
+ "total_flos": 1.6329142901994947e+18,
4295
  "train_batch_size": 4,
4296
  "trial_name": null,
4297
  "trial_params": null