romainnn committed
Commit 5020091 · verified · 1 Parent(s): aeaf645

Training in progress, step 200, checkpoint
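
The files changed below are the standard contents of a Hugging Face Trainer checkpoint for a PEFT/LoRA run: the adapter weights (adapter_model.safetensors), optimizer and LR-scheduler state, the RNG state, and trainer_state.json. As a rough, hypothetical sketch (the actual training script is not part of this commit), a configuration like the following would save and push such a checkpoint every 100 steps; only output_dir, the batch size, logging_steps, and eval_steps are values actually visible in the trainer_state.json diff further down.

# Hypothetical sketch of a Trainer configuration consistent with this checkpoint;
# the repository's real training script is not included in the commit.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="miner_id_24",          # matches "best_model_checkpoint": "miner_id_24/checkpoint-200"
    per_device_train_batch_size=4,     # "train_batch_size": 4
    logging_steps=1,                   # "logging_steps": 1
    eval_strategy="steps",
    eval_steps=100,                    # "eval_steps": 100
    save_steps=100,                    # checkpoints at step 100, 200, ...
    load_best_model_at_end=True,       # keeps "best_metric" / "best_model_checkpoint" updated
    metric_for_best_model="eval_loss",
    push_to_hub=True,
    hub_strategy="checkpoint",         # pushes the last-checkpoint/ folder seen in this diff
)
# args would then be passed to Trainer(model=..., args=args, train_dataset=..., eval_dataset=...).
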

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a8de489ef3b321cb1ae9af64ca58c6dbec7a1d4128393e4857a2c221f6ea076a
+ oid sha256:19706cf0c3c9917c9184ea6dd896ae403cff3f60135017a3817469095368ba3d
  size 159967880
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:9d3a26ad7e055007417a4ed97af3d431946811ebd851b248dab10f8f6205bc8a
+ oid sha256:599285b80cf8578bfb241dec5482dcbc643d2bbff8ed837d8b8bde37f4468e55
  size 81730196
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d0331b567cfc16b8ba9714182b6d1197f7e53379d529dce315a5110d4c146f3d
+ oid sha256:2c4c574b02d176cf65e372b568790452567f681562fac58ee8b8e4f350e30a8d
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:c7d9ae41e6d249905bc3fac6a098c95704c803904081b2ec9bf85cc2bde9d9e0
+ oid sha256:b0bc2caca23d108aa0e5bff0f5c61c148bd9877bb3fac62655062c7ee8a3560e
  size 1064
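
Each of the four pointer diffs above swaps only the Git LFS object id (the SHA-256 of the new blob); the byte sizes are unchanged. A small sketch, assuming the files have been downloaded locally, for checking a file against the oid and size recorded in its pointer; the example values simply reuse the new scheduler.pt entry:

# Sketch: verify a downloaded file against the "oid sha256:..." and "size ..."
# fields of its Git LFS pointer. Assumes the checkpoint has been fetched locally.
import hashlib
import os

def matches_lfs_pointer(path, expected_oid, expected_size):
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            digest.update(chunk)
    return digest.hexdigest() == expected_oid and os.path.getsize(path) == expected_size

print(matches_lfs_pointer(
    "last-checkpoint/scheduler.pt",
    "b0bc2caca23d108aa0e5bff0f5c61c148bd9877bb3fac62655062c7ee8a3560e",
    1064,
))
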
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 1.2061141729354858,
- "best_model_checkpoint": "miner_id_24/checkpoint-100",
- "epoch": 0.006965000870625109,
+ "best_metric": 0.993212103843689,
+ "best_model_checkpoint": "miner_id_24/checkpoint-200",
+ "epoch": 0.013930001741250218,
  "eval_steps": 100,
- "global_step": 100,
+ "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -723,6 +723,714 @@
723
  "eval_samples_per_second": 7.146,
724
  "eval_steps_per_second": 1.786,
725
  "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.00703465087933136,
729
+ "grad_norm": 0.7039173245429993,
730
+ "learning_rate": 0.00019556189188465702,
731
+ "loss": 1.4391,
732
+ "step": 101
733
+ },
734
+ {
735
+ "epoch": 0.007104300888037611,
736
+ "grad_norm": 0.8350788354873657,
737
+ "learning_rate": 0.00019546456321594376,
738
+ "loss": 1.1431,
739
+ "step": 102
740
+ },
741
+ {
742
+ "epoch": 0.0071739508967438624,
743
+ "grad_norm": 0.6535744667053223,
744
+ "learning_rate": 0.0001953662036253438,
745
+ "loss": 1.296,
746
+ "step": 103
747
+ },
748
+ {
749
+ "epoch": 0.0072436009054501135,
750
+ "grad_norm": 0.7496301531791687,
751
+ "learning_rate": 0.00019526681417504258,
752
+ "loss": 1.311,
753
+ "step": 104
754
+ },
755
+ {
756
+ "epoch": 0.007313250914156365,
757
+ "grad_norm": 0.7061691880226135,
758
+ "learning_rate": 0.0001951663959383468,
759
+ "loss": 1.3601,
760
+ "step": 105
761
+ },
762
+ {
763
+ "epoch": 0.007382900922862616,
764
+ "grad_norm": 0.8221380114555359,
765
+ "learning_rate": 0.00019506494999967298,
766
+ "loss": 1.3149,
767
+ "step": 106
768
+ },
769
+ {
770
+ "epoch": 0.007452550931568867,
771
+ "grad_norm": 0.9544386267662048,
772
+ "learning_rate": 0.000194962477454536,
773
+ "loss": 1.2967,
774
+ "step": 107
775
+ },
776
+ {
777
+ "epoch": 0.007522200940275118,
778
+ "grad_norm": 0.8127594590187073,
779
+ "learning_rate": 0.00019485897940953688,
780
+ "loss": 1.4015,
781
+ "step": 108
782
+ },
783
+ {
784
+ "epoch": 0.007591850948981369,
785
+ "grad_norm": 0.7376645803451538,
786
+ "learning_rate": 0.0001947544569823511,
787
+ "loss": 1.4958,
788
+ "step": 109
789
+ },
790
+ {
791
+ "epoch": 0.00766150095768762,
792
+ "grad_norm": 0.6602767705917358,
793
+ "learning_rate": 0.00019464891130171647,
794
+ "loss": 1.3593,
795
+ "step": 110
796
+ },
797
+ {
798
+ "epoch": 0.007731150966393871,
799
+ "grad_norm": 0.9318028092384338,
800
+ "learning_rate": 0.0001945423435074208,
801
+ "loss": 1.0125,
802
+ "step": 111
803
+ },
804
+ {
805
+ "epoch": 0.007800800975100122,
806
+ "grad_norm": 0.7048940062522888,
807
+ "learning_rate": 0.00019443475475028983,
808
+ "loss": 1.4342,
809
+ "step": 112
810
+ },
811
+ {
812
+ "epoch": 0.007870450983806372,
813
+ "grad_norm": 0.9778817892074585,
814
+ "learning_rate": 0.00019432614619217459,
815
+ "loss": 1.0368,
816
+ "step": 113
817
+ },
818
+ {
819
+ "epoch": 0.007940100992512624,
820
+ "grad_norm": 0.808047890663147,
821
+ "learning_rate": 0.000194216519005939,
822
+ "loss": 1.105,
823
+ "step": 114
824
+ },
825
+ {
826
+ "epoch": 0.008009751001218875,
827
+ "grad_norm": 0.7996501326560974,
828
+ "learning_rate": 0.0001941058743754471,
829
+ "loss": 1.1383,
830
+ "step": 115
831
+ },
832
+ {
833
+ "epoch": 0.008079401009925127,
834
+ "grad_norm": 1.0752230882644653,
835
+ "learning_rate": 0.00019399421349555035,
836
+ "loss": 1.3508,
837
+ "step": 116
838
+ },
839
+ {
840
+ "epoch": 0.008149051018631377,
841
+ "grad_norm": 0.7151166200637817,
842
+ "learning_rate": 0.00019388153757207471,
843
+ "loss": 1.4086,
844
+ "step": 117
845
+ },
846
+ {
847
+ "epoch": 0.008218701027337629,
848
+ "grad_norm": 0.7622511386871338,
849
+ "learning_rate": 0.00019376784782180746,
850
+ "loss": 1.1942,
851
+ "step": 118
852
+ },
853
+ {
854
+ "epoch": 0.008288351036043879,
855
+ "grad_norm": 0.6896407008171082,
856
+ "learning_rate": 0.0001936531454724844,
857
+ "loss": 1.2571,
858
+ "step": 119
859
+ },
860
+ {
861
+ "epoch": 0.008358001044750131,
862
+ "grad_norm": 0.7991106510162354,
863
+ "learning_rate": 0.00019353743176277622,
864
+ "loss": 1.2531,
865
+ "step": 120
866
+ },
867
+ {
868
+ "epoch": 0.008427651053456381,
869
+ "grad_norm": 0.8540248870849609,
870
+ "learning_rate": 0.00019342070794227536,
871
+ "loss": 1.223,
872
+ "step": 121
873
+ },
874
+ {
875
+ "epoch": 0.008497301062162633,
876
+ "grad_norm": 0.8329891562461853,
877
+ "learning_rate": 0.00019330297527148246,
878
+ "loss": 0.9099,
879
+ "step": 122
880
+ },
881
+ {
882
+ "epoch": 0.008566951070868883,
883
+ "grad_norm": 0.7838830351829529,
884
+ "learning_rate": 0.00019318423502179272,
885
+ "loss": 1.3098,
886
+ "step": 123
887
+ },
888
+ {
889
+ "epoch": 0.008636601079575135,
890
+ "grad_norm": 0.7665576338768005,
891
+ "learning_rate": 0.00019306448847548216,
892
+ "loss": 1.3633,
893
+ "step": 124
894
+ },
895
+ {
896
+ "epoch": 0.008706251088281386,
897
+ "grad_norm": 0.7157841324806213,
898
+ "learning_rate": 0.00019294373692569383,
899
+ "loss": 0.9222,
900
+ "step": 125
901
+ },
902
+ {
903
+ "epoch": 0.008775901096987638,
904
+ "grad_norm": 0.944957971572876,
905
+ "learning_rate": 0.0001928219816764238,
906
+ "loss": 1.0901,
907
+ "step": 126
908
+ },
909
+ {
910
+ "epoch": 0.008845551105693888,
911
+ "grad_norm": 0.636736273765564,
912
+ "learning_rate": 0.0001926992240425071,
913
+ "loss": 1.3484,
914
+ "step": 127
915
+ },
916
+ {
917
+ "epoch": 0.00891520111440014,
918
+ "grad_norm": 0.6209918260574341,
919
+ "learning_rate": 0.0001925754653496035,
920
+ "loss": 1.3551,
921
+ "step": 128
922
+ },
923
+ {
924
+ "epoch": 0.00898485112310639,
925
+ "grad_norm": 0.7056594491004944,
926
+ "learning_rate": 0.00019245070693418322,
927
+ "loss": 1.4229,
928
+ "step": 129
929
+ },
930
+ {
931
+ "epoch": 0.009054501131812642,
932
+ "grad_norm": 0.7279839515686035,
933
+ "learning_rate": 0.00019232495014351246,
934
+ "loss": 1.0699,
935
+ "step": 130
936
+ },
937
+ {
938
+ "epoch": 0.009124151140518892,
939
+ "grad_norm": 0.6324151754379272,
940
+ "learning_rate": 0.00019219819633563891,
941
+ "loss": 1.3833,
942
+ "step": 131
943
+ },
944
+ {
945
+ "epoch": 0.009193801149225144,
946
+ "grad_norm": 0.7449592351913452,
947
+ "learning_rate": 0.00019207044687937703,
948
+ "loss": 1.2067,
949
+ "step": 132
950
+ },
951
+ {
952
+ "epoch": 0.009263451157931394,
953
+ "grad_norm": 0.939274787902832,
954
+ "learning_rate": 0.0001919417031542933,
955
+ "loss": 1.3229,
956
+ "step": 133
957
+ },
958
+ {
959
+ "epoch": 0.009333101166637646,
960
+ "grad_norm": 0.8192336559295654,
961
+ "learning_rate": 0.00019181196655069127,
962
+ "loss": 1.1575,
963
+ "step": 134
964
+ },
965
+ {
966
+ "epoch": 0.009402751175343897,
967
+ "grad_norm": 0.7507984638214111,
968
+ "learning_rate": 0.00019168123846959666,
969
+ "loss": 1.0461,
970
+ "step": 135
971
+ },
972
+ {
973
+ "epoch": 0.009472401184050148,
974
+ "grad_norm": 0.6593666672706604,
975
+ "learning_rate": 0.00019154952032274206,
976
+ "loss": 1.3806,
977
+ "step": 136
978
+ },
979
+ {
980
+ "epoch": 0.009542051192756399,
981
+ "grad_norm": 0.6475424766540527,
982
+ "learning_rate": 0.00019141681353255184,
983
+ "loss": 0.9218,
984
+ "step": 137
985
+ },
986
+ {
987
+ "epoch": 0.00961170120146265,
988
+ "grad_norm": 0.7746126651763916,
989
+ "learning_rate": 0.00019128311953212678,
990
+ "loss": 0.8967,
991
+ "step": 138
992
+ },
993
+ {
994
+ "epoch": 0.009681351210168901,
995
+ "grad_norm": 0.7104780673980713,
996
+ "learning_rate": 0.00019114843976522842,
997
+ "loss": 1.1855,
998
+ "step": 139
999
+ },
1000
+ {
1001
+ "epoch": 0.009751001218875153,
1002
+ "grad_norm": 0.597457230091095,
1003
+ "learning_rate": 0.00019101277568626374,
1004
+ "loss": 1.0809,
1005
+ "step": 140
1006
+ },
1007
+ {
1008
+ "epoch": 0.009820651227581403,
1009
+ "grad_norm": 0.8071316480636597,
1010
+ "learning_rate": 0.00019087612876026908,
1011
+ "loss": 1.0129,
1012
+ "step": 141
1013
+ },
1014
+ {
1015
+ "epoch": 0.009890301236287655,
1016
+ "grad_norm": 0.8741605877876282,
1017
+ "learning_rate": 0.00019073850046289484,
1018
+ "loss": 0.8784,
1019
+ "step": 142
1020
+ },
1021
+ {
1022
+ "epoch": 0.009959951244993905,
1023
+ "grad_norm": 0.7503401637077332,
1024
+ "learning_rate": 0.00019059989228038902,
1025
+ "loss": 1.1498,
1026
+ "step": 143
1027
+ },
1028
+ {
1029
+ "epoch": 0.010029601253700157,
1030
+ "grad_norm": 0.7068141102790833,
1031
+ "learning_rate": 0.0001904603057095815,
1032
+ "loss": 1.2644,
1033
+ "step": 144
1034
+ },
1035
+ {
1036
+ "epoch": 0.010099251262406407,
1037
+ "grad_norm": 0.7954654097557068,
1038
+ "learning_rate": 0.0001903197422578678,
1039
+ "loss": 1.1108,
1040
+ "step": 145
1041
+ },
1042
+ {
1043
+ "epoch": 0.01016890127111266,
1044
+ "grad_norm": 0.7548302412033081,
1045
+ "learning_rate": 0.0001901782034431927,
1046
+ "loss": 0.9177,
1047
+ "step": 146
1048
+ },
1049
+ {
1050
+ "epoch": 0.01023855127981891,
1051
+ "grad_norm": 0.7617766261100769,
1052
+ "learning_rate": 0.00019003569079403395,
1053
+ "loss": 1.256,
1054
+ "step": 147
1055
+ },
1056
+ {
1057
+ "epoch": 0.010308201288525162,
1058
+ "grad_norm": 0.7205716967582703,
1059
+ "learning_rate": 0.00018989220584938573,
1060
+ "loss": 1.3767,
1061
+ "step": 148
1062
+ },
1063
+ {
1064
+ "epoch": 0.010377851297231412,
1065
+ "grad_norm": 0.6221201419830322,
1066
+ "learning_rate": 0.00018974775015874213,
1067
+ "loss": 1.3329,
1068
+ "step": 149
1069
+ },
1070
+ {
1071
+ "epoch": 0.010447501305937664,
1072
+ "grad_norm": 0.565428614616394,
1073
+ "learning_rate": 0.00018960232528208022,
1074
+ "loss": 1.1155,
1075
+ "step": 150
1076
+ },
1077
+ {
1078
+ "epoch": 0.010517151314643914,
1079
+ "grad_norm": 0.7672913074493408,
1080
+ "learning_rate": 0.00018945593278984333,
1081
+ "loss": 0.9654,
1082
+ "step": 151
1083
+ },
1084
+ {
1085
+ "epoch": 0.010586801323350166,
1086
+ "grad_norm": 0.737074077129364,
1087
+ "learning_rate": 0.00018930857426292412,
1088
+ "loss": 1.0644,
1089
+ "step": 152
1090
+ },
1091
+ {
1092
+ "epoch": 0.010656451332056416,
1093
+ "grad_norm": 0.6545393466949463,
1094
+ "learning_rate": 0.0001891602512926474,
1095
+ "loss": 1.2058,
1096
+ "step": 153
1097
+ },
1098
+ {
1099
+ "epoch": 0.010726101340762668,
1100
+ "grad_norm": 0.8019453287124634,
1101
+ "learning_rate": 0.00018901096548075305,
1102
+ "loss": 1.3134,
1103
+ "step": 154
1104
+ },
1105
+ {
1106
+ "epoch": 0.010795751349468918,
1107
+ "grad_norm": 0.8307440876960754,
1108
+ "learning_rate": 0.00018886071843937866,
1109
+ "loss": 1.152,
1110
+ "step": 155
1111
+ },
1112
+ {
1113
+ "epoch": 0.01086540135817517,
1114
+ "grad_norm": 0.8050329089164734,
1115
+ "learning_rate": 0.00018870951179104212,
1116
+ "loss": 0.9473,
1117
+ "step": 156
1118
+ },
1119
+ {
1120
+ "epoch": 0.01093505136688142,
1121
+ "grad_norm": 0.7510560154914856,
1122
+ "learning_rate": 0.00018855734716862417,
1123
+ "loss": 1.2265,
1124
+ "step": 157
1125
+ },
1126
+ {
1127
+ "epoch": 0.011004701375587672,
1128
+ "grad_norm": 0.7653977274894714,
1129
+ "learning_rate": 0.00018840422621535066,
1130
+ "loss": 1.3356,
1131
+ "step": 158
1132
+ },
1133
+ {
1134
+ "epoch": 0.011074351384293923,
1135
+ "grad_norm": 0.7661434412002563,
1136
+ "learning_rate": 0.00018825015058477481,
1137
+ "loss": 0.9601,
1138
+ "step": 159
1139
+ },
1140
+ {
1141
+ "epoch": 0.011144001393000175,
1142
+ "grad_norm": 0.7829368114471436,
1143
+ "learning_rate": 0.00018809512194075957,
1144
+ "loss": 1.0675,
1145
+ "step": 160
1146
+ },
1147
+ {
1148
+ "epoch": 0.011213651401706425,
1149
+ "grad_norm": 0.6673858761787415,
1150
+ "learning_rate": 0.00018793914195745933,
1151
+ "loss": 1.4312,
1152
+ "step": 161
1153
+ },
1154
+ {
1155
+ "epoch": 0.011283301410412677,
1156
+ "grad_norm": 0.8060672879219055,
1157
+ "learning_rate": 0.00018778221231930203,
1158
+ "loss": 1.0241,
1159
+ "step": 162
1160
+ },
1161
+ {
1162
+ "epoch": 0.011352951419118927,
1163
+ "grad_norm": 1.0137969255447388,
1164
+ "learning_rate": 0.00018762433472097097,
1165
+ "loss": 1.1867,
1166
+ "step": 163
1167
+ },
1168
+ {
1169
+ "epoch": 0.011422601427825179,
1170
+ "grad_norm": 0.9313655495643616,
1171
+ "learning_rate": 0.0001874655108673864,
1172
+ "loss": 1.3046,
1173
+ "step": 164
1174
+ },
1175
+ {
1176
+ "epoch": 0.01149225143653143,
1177
+ "grad_norm": 0.9493317008018494,
1178
+ "learning_rate": 0.00018730574247368732,
1179
+ "loss": 1.1123,
1180
+ "step": 165
1181
+ },
1182
+ {
1183
+ "epoch": 0.011561901445237681,
1184
+ "grad_norm": 0.8069944977760315,
1185
+ "learning_rate": 0.0001871450312652126,
1186
+ "loss": 1.0592,
1187
+ "step": 166
1188
+ },
1189
+ {
1190
+ "epoch": 0.011631551453943931,
1191
+ "grad_norm": 0.6559287905693054,
1192
+ "learning_rate": 0.00018698337897748283,
1193
+ "loss": 1.2388,
1194
+ "step": 167
1195
+ },
1196
+ {
1197
+ "epoch": 0.011701201462650183,
1198
+ "grad_norm": 0.650059700012207,
1199
+ "learning_rate": 0.0001868207873561811,
1200
+ "loss": 0.9891,
1201
+ "step": 168
1202
+ },
1203
+ {
1204
+ "epoch": 0.011770851471356434,
1205
+ "grad_norm": 0.6247674822807312,
1206
+ "learning_rate": 0.00018665725815713443,
1207
+ "loss": 1.2925,
1208
+ "step": 169
1209
+ },
1210
+ {
1211
+ "epoch": 0.011840501480062686,
1212
+ "grad_norm": 0.7453685402870178,
1213
+ "learning_rate": 0.00018649279314629483,
1214
+ "loss": 1.06,
1215
+ "step": 170
1216
+ },
1217
+ {
1218
+ "epoch": 0.011910151488768936,
1219
+ "grad_norm": 0.826835572719574,
1220
+ "learning_rate": 0.00018632739409972003,
1221
+ "loss": 0.9637,
1222
+ "step": 171
1223
+ },
1224
+ {
1225
+ "epoch": 0.011979801497475188,
1226
+ "grad_norm": 0.7538785338401794,
1227
+ "learning_rate": 0.00018616106280355444,
1228
+ "loss": 1.0126,
1229
+ "step": 172
1230
+ },
1231
+ {
1232
+ "epoch": 0.012049451506181438,
1233
+ "grad_norm": 0.8348299264907837,
1234
+ "learning_rate": 0.00018599380105400982,
1235
+ "loss": 0.988,
1236
+ "step": 173
1237
+ },
1238
+ {
1239
+ "epoch": 0.01211910151488769,
1240
+ "grad_norm": 0.8298357725143433,
1241
+ "learning_rate": 0.00018582561065734604,
1242
+ "loss": 1.0608,
1243
+ "step": 174
1244
+ },
1245
+ {
1246
+ "epoch": 0.01218875152359394,
1247
+ "grad_norm": 0.6961440443992615,
1248
+ "learning_rate": 0.00018565649342985118,
1249
+ "loss": 1.1564,
1250
+ "step": 175
1251
+ },
1252
+ {
1253
+ "epoch": 0.012258401532300192,
1254
+ "grad_norm": 0.664256751537323,
1255
+ "learning_rate": 0.00018548645119782238,
1256
+ "loss": 1.1865,
1257
+ "step": 176
1258
+ },
1259
+ {
1260
+ "epoch": 0.012328051541006442,
1261
+ "grad_norm": 0.7857444882392883,
1262
+ "learning_rate": 0.0001853154857975458,
1263
+ "loss": 0.9903,
1264
+ "step": 177
1265
+ },
1266
+ {
1267
+ "epoch": 0.012397701549712694,
1268
+ "grad_norm": 0.758602499961853,
1269
+ "learning_rate": 0.0001851435990752769,
1270
+ "loss": 1.3456,
1271
+ "step": 178
1272
+ },
1273
+ {
1274
+ "epoch": 0.012467351558418945,
1275
+ "grad_norm": 0.768666684627533,
1276
+ "learning_rate": 0.0001849707928872206,
1277
+ "loss": 0.9773,
1278
+ "step": 179
1279
+ },
1280
+ {
1281
+ "epoch": 0.012537001567125197,
1282
+ "grad_norm": 0.8674852848052979,
1283
+ "learning_rate": 0.00018479706909951094,
1284
+ "loss": 1.0203,
1285
+ "step": 180
1286
+ },
1287
+ {
1288
+ "epoch": 0.012606651575831447,
1289
+ "grad_norm": 0.6384921669960022,
1290
+ "learning_rate": 0.0001846224295881913,
1291
+ "loss": 1.1004,
1292
+ "step": 181
1293
+ },
1294
+ {
1295
+ "epoch": 0.012676301584537699,
1296
+ "grad_norm": 0.6848528981208801,
1297
+ "learning_rate": 0.00018444687623919386,
1298
+ "loss": 1.0699,
1299
+ "step": 182
1300
+ },
1301
+ {
1302
+ "epoch": 0.012745951593243949,
1303
+ "grad_norm": 0.6943731307983398,
1304
+ "learning_rate": 0.00018427041094831937,
1305
+ "loss": 1.1812,
1306
+ "step": 183
1307
+ },
1308
+ {
1309
+ "epoch": 0.012815601601950201,
1310
+ "grad_norm": 1.0284762382507324,
1311
+ "learning_rate": 0.00018409303562121662,
1312
+ "loss": 1.1307,
1313
+ "step": 184
1314
+ },
1315
+ {
1316
+ "epoch": 0.012885251610656451,
1317
+ "grad_norm": 0.7977420091629028,
1318
+ "learning_rate": 0.00018391475217336193,
1319
+ "loss": 1.0772,
1320
+ "step": 185
1321
+ },
1322
+ {
1323
+ "epoch": 0.012954901619362703,
1324
+ "grad_norm": 0.678799569606781,
1325
+ "learning_rate": 0.0001837355625300383,
1326
+ "loss": 1.1816,
1327
+ "step": 186
1328
+ },
1329
+ {
1330
+ "epoch": 0.013024551628068953,
1331
+ "grad_norm": 0.7933035492897034,
1332
+ "learning_rate": 0.00018355546862631493,
1333
+ "loss": 1.2014,
1334
+ "step": 187
1335
+ },
1336
+ {
1337
+ "epoch": 0.013094201636775205,
1338
+ "grad_norm": 0.7373278737068176,
1339
+ "learning_rate": 0.00018337447240702594,
1340
+ "loss": 0.9163,
1341
+ "step": 188
1342
+ },
1343
+ {
1344
+ "epoch": 0.013163851645481455,
1345
+ "grad_norm": 0.7306934595108032,
1346
+ "learning_rate": 0.00018319257582674964,
1347
+ "loss": 0.8467,
1348
+ "step": 189
1349
+ },
1350
+ {
1351
+ "epoch": 0.013233501654187707,
1352
+ "grad_norm": 0.6722437739372253,
1353
+ "learning_rate": 0.00018300978084978735,
1354
+ "loss": 1.1145,
1355
+ "step": 190
1356
+ },
1357
+ {
1358
+ "epoch": 0.013303151662893958,
1359
+ "grad_norm": 0.8375574350357056,
1360
+ "learning_rate": 0.00018282608945014217,
1361
+ "loss": 0.8763,
1362
+ "step": 191
1363
+ },
1364
+ {
1365
+ "epoch": 0.01337280167160021,
1366
+ "grad_norm": 0.6876571774482727,
1367
+ "learning_rate": 0.0001826415036114976,
1368
+ "loss": 1.3694,
1369
+ "step": 192
1370
+ },
1371
+ {
1372
+ "epoch": 0.01344245168030646,
1373
+ "grad_norm": 0.5936222076416016,
1374
+ "learning_rate": 0.0001824560253271963,
1375
+ "loss": 1.4071,
1376
+ "step": 193
1377
+ },
1378
+ {
1379
+ "epoch": 0.013512101689012712,
1380
+ "grad_norm": 0.6679614782333374,
1381
+ "learning_rate": 0.00018226965660021836,
1382
+ "loss": 0.8098,
1383
+ "step": 194
1384
+ },
1385
+ {
1386
+ "epoch": 0.013581751697718962,
1387
+ "grad_norm": 0.8226193189620972,
1388
+ "learning_rate": 0.00018208239944315978,
1389
+ "loss": 0.6594,
1390
+ "step": 195
1391
+ },
1392
+ {
1393
+ "epoch": 0.013651401706425214,
1394
+ "grad_norm": 0.8376763463020325,
1395
+ "learning_rate": 0.0001818942558782108,
1396
+ "loss": 1.0417,
1397
+ "step": 196
1398
+ },
1399
+ {
1400
+ "epoch": 0.013721051715131464,
1401
+ "grad_norm": 0.773747444152832,
1402
+ "learning_rate": 0.00018170522793713387,
1403
+ "loss": 0.7496,
1404
+ "step": 197
1405
+ },
1406
+ {
1407
+ "epoch": 0.013790701723837716,
1408
+ "grad_norm": 0.8213014006614685,
1409
+ "learning_rate": 0.00018151531766124186,
1410
+ "loss": 0.842,
1411
+ "step": 198
1412
+ },
1413
+ {
1414
+ "epoch": 0.013860351732543966,
1415
+ "grad_norm": 0.6993326544761658,
1416
+ "learning_rate": 0.000181324527101376,
1417
+ "loss": 1.1651,
1418
+ "step": 199
1419
+ },
1420
+ {
1421
+ "epoch": 0.013930001741250218,
1422
+ "grad_norm": 0.550957977771759,
1423
+ "learning_rate": 0.00018113285831788365,
1424
+ "loss": 1.2762,
1425
+ "step": 200
1426
+ },
1427
+ {
1428
+ "epoch": 0.013930001741250218,
1429
+ "eval_loss": 0.993212103843689,
1430
+ "eval_runtime": 699.7494,
1431
+ "eval_samples_per_second": 7.145,
1432
+ "eval_steps_per_second": 1.786,
1433
+ "step": 200
1434
  }
1435
  ],
1436
  "logging_steps": 1,
 
@@ -751,7 +1459,7 @@
  "attributes": {}
  }
  },
- "total_flos": 2.7358071474880512e+17,
+ "total_flos": 5.4518373758376346e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
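
The trainer_state.json diff records the run advancing from global step 100 to 200: the per-step log entries for steps 101-200 plus the step-200 eval record are appended, the eval loss improves from 1.2061 to 0.9932, checkpoint-200 becomes the best checkpoint, and total_flos roughly doubles. A short sketch, assuming a local copy of the checkpoint, for reading those fields back:

# Sketch: inspect the fields updated in this commit from a local copy of the
# checkpoint's trainer_state.json; the path is a placeholder for wherever it was downloaded.
import json

with open("last-checkpoint/trainer_state.json") as f:
    state = json.load(f)

print("global_step:", state["global_step"])                # 200 after this commit
print("best_metric:", state["best_metric"])                # 0.9932... (eval_loss at step 200)
print("best checkpoint:", state["best_model_checkpoint"])  # miner_id_24/checkpoint-200

# The per-step records appended in this commit live in the log_history list.
last = state["log_history"][-1]
print("last logged step:", last["step"], "eval_loss:", last.get("eval_loss"))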