souging commited on
Commit
8952f86
·
verified ·
1 Parent(s): 097f182

Training in progress, epoch 2, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ae49f66fa40de83d0480645b761e8a753be0e28dae3d7766c6d3f8a1baf76758
3
  size 800116456
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97891a60ef32278715fe2fd622467a12763337c4f29242e0f47e6fa912ad3bfe
3
  size 800116456
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3edba13ad09666a36a27cebd58dda9e758e7e87ddd5bef0b427617e52e33efac
3
  size 406743860
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f4c7a6cc33054e1b8f261b1bf71a365bc3b95974dbb2778025c3addcdbbc007
3
  size 406743860
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21baf4fcb4a38e73544f06f6ef7f7a2d5294a4ec46aaecfe18f9b11ab06b0be2
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:863886f073e9c5c689c6ec3b1475c24da88e3fc752e2e629afed4a2fd70f22d2
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:99ac34f10b5b4b30f9a14a4b42e76496b127d0b2a59dc004d710f643389b83f4
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4274bf3f4a7c08973ea99b74ce6035a79abe0939430e685f8d3ba7036343982
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:38b88187ca347822e8568870ea86a324e54f63d62d2f024d8758ac5294d40a4a
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:301fa96e37e1ba2ee75f0fbc01f26e4f12cd1597463ce19b79f6f19cd894f004
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:77fbe34d05fd83a2009eafa4d761d1a348b27a341023148bb5b1a192c63511e7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b1fde6899045457175e8c52d857720181b80762222d8f14a4261f8c0678492
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cef45ca40916c741f85a2a224b408d1dedf2b7d0472602a0f734da65b3437ad6
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ee6337bf076a6e596f313be0012d284f2fba533e1d8c79adb3b2cd349de8f1f
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0987883926c20bb8f52b158ac1c13ef4cc918beb36cd85f83e47464909ebac27
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fed1407b50de8e21d020387b421a3eeaf16b75cb4dc805daa58a6a4650647dfe
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6225dc3a627ae28f20a3d380b0ed35a7dd891b3f65db0e8bbd21c856decaf133
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4ea5f0174438e143d41f6c7ed1b169273deb7031a4afb3e805dfd389ef07e74
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f877841f36b3220621f0ed636d6be2da52fa59ed16112fd3dca52767c19bb52
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2f1bd34cca459054a035c07dfbf38f53c01552d6dfad51dfd430634d69f7f8c
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d4bca0f9e13713e07601b4dc0bd9d3aa19ee7ad3e516a1c659b4636fc2a3b9f2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88e7d89c7e9ab9880d864078f36a7dcd958bf79763744c29456aa56ddb874c0f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.9949937421777222,
5
  "eval_steps": 500,
6
- "global_step": 398,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2793,6 +2793,1399 @@
2793
  "learning_rate": 0.00010205765019574084,
2794
  "loss": 1.3904,
2795
  "step": 398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2796
  }
2797
  ],
2798
  "logging_steps": 1,
@@ -2812,7 +4205,7 @@
2812
  "attributes": {}
2813
  }
2814
  },
2815
- "total_flos": 1.2530562039493427e+18,
2816
  "train_batch_size": 4,
2817
  "trial_name": null,
2818
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9937421777221527,
5
  "eval_steps": 500,
6
+ "global_step": 597,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2793
  "learning_rate": 0.00010205765019574084,
2794
  "loss": 1.3904,
2795
  "step": 398
2796
+ },
2797
+ {
2798
+ "epoch": 2.002503128911139,
2799
+ "grad_norm": 9.335354804992676,
2800
+ "learning_rate": 0.00010152883234697336,
2801
+ "loss": 2.9917,
2802
+ "step": 399
2803
+ },
2804
+ {
2805
+ "epoch": 2.0075093867334166,
2806
+ "grad_norm": 3.17944598197937,
2807
+ "learning_rate": 0.000101,
2808
+ "loss": 1.1571,
2809
+ "step": 400
2810
+ },
2811
+ {
2812
+ "epoch": 2.0125156445556946,
2813
+ "grad_norm": 3.515106678009033,
2814
+ "learning_rate": 0.00010047116765302661,
2815
+ "loss": 1.1999,
2816
+ "step": 401
2817
+ },
2818
+ {
2819
+ "epoch": 2.0175219023779727,
2820
+ "grad_norm": 4.571992874145508,
2821
+ "learning_rate": 9.994234980425921e-05,
2822
+ "loss": 1.2175,
2823
+ "step": 402
2824
+ },
2825
+ {
2826
+ "epoch": 2.0225281602002503,
2827
+ "grad_norm": 4.036806106567383,
2828
+ "learning_rate": 9.941356095150613e-05,
2829
+ "loss": 1.114,
2830
+ "step": 403
2831
+ },
2832
+ {
2833
+ "epoch": 2.0275344180225283,
2834
+ "grad_norm": 3.547945499420166,
2835
+ "learning_rate": 9.888481559178096e-05,
2836
+ "loss": 1.2359,
2837
+ "step": 404
2838
+ },
2839
+ {
2840
+ "epoch": 2.032540675844806,
2841
+ "grad_norm": 4.097041130065918,
2842
+ "learning_rate": 9.835612822090483e-05,
2843
+ "loss": 1.2015,
2844
+ "step": 405
2845
+ },
2846
+ {
2847
+ "epoch": 2.037546933667084,
2848
+ "grad_norm": 4.594753742218018,
2849
+ "learning_rate": 9.782751333310905e-05,
2850
+ "loss": 1.082,
2851
+ "step": 406
2852
+ },
2853
+ {
2854
+ "epoch": 2.0425531914893615,
2855
+ "grad_norm": 3.9699299335479736,
2856
+ "learning_rate": 9.72989854206378e-05,
2857
+ "loss": 1.258,
2858
+ "step": 407
2859
+ },
2860
+ {
2861
+ "epoch": 2.0475594493116396,
2862
+ "grad_norm": 4.059878349304199,
2863
+ "learning_rate": 9.677055897335087e-05,
2864
+ "loss": 1.0336,
2865
+ "step": 408
2866
+ },
2867
+ {
2868
+ "epoch": 2.052565707133917,
2869
+ "grad_norm": 3.936467409133911,
2870
+ "learning_rate": 9.62422484783261e-05,
2871
+ "loss": 1.3053,
2872
+ "step": 409
2873
+ },
2874
+ {
2875
+ "epoch": 2.057571964956195,
2876
+ "grad_norm": 3.5442802906036377,
2877
+ "learning_rate": 9.571406841946267e-05,
2878
+ "loss": 1.0357,
2879
+ "step": 410
2880
+ },
2881
+ {
2882
+ "epoch": 2.0625782227784732,
2883
+ "grad_norm": 3.461674213409424,
2884
+ "learning_rate": 9.518603327708372e-05,
2885
+ "loss": 1.0689,
2886
+ "step": 411
2887
+ },
2888
+ {
2889
+ "epoch": 2.067584480600751,
2890
+ "grad_norm": 3.423532247543335,
2891
+ "learning_rate": 9.465815752753935e-05,
2892
+ "loss": 1.0562,
2893
+ "step": 412
2894
+ },
2895
+ {
2896
+ "epoch": 2.072590738423029,
2897
+ "grad_norm": 4.412137508392334,
2898
+ "learning_rate": 9.413045564280998e-05,
2899
+ "loss": 1.2204,
2900
+ "step": 413
2901
+ },
2902
+ {
2903
+ "epoch": 2.0775969962453065,
2904
+ "grad_norm": 3.3397271633148193,
2905
+ "learning_rate": 9.360294209010923e-05,
2906
+ "loss": 0.9651,
2907
+ "step": 414
2908
+ },
2909
+ {
2910
+ "epoch": 2.0826032540675845,
2911
+ "grad_norm": 3.77931547164917,
2912
+ "learning_rate": 9.307563133148767e-05,
2913
+ "loss": 1.2274,
2914
+ "step": 415
2915
+ },
2916
+ {
2917
+ "epoch": 2.0876095118898625,
2918
+ "grad_norm": 3.565459728240967,
2919
+ "learning_rate": 9.254853782343616e-05,
2920
+ "loss": 1.1143,
2921
+ "step": 416
2922
+ },
2923
+ {
2924
+ "epoch": 2.09261576971214,
2925
+ "grad_norm": 3.6844406127929688,
2926
+ "learning_rate": 9.202167601648942e-05,
2927
+ "loss": 1.2619,
2928
+ "step": 417
2929
+ },
2930
+ {
2931
+ "epoch": 2.097622027534418,
2932
+ "grad_norm": 3.3748247623443604,
2933
+ "learning_rate": 9.149506035483005e-05,
2934
+ "loss": 1.0988,
2935
+ "step": 418
2936
+ },
2937
+ {
2938
+ "epoch": 2.1026282853566958,
2939
+ "grad_norm": 3.4693350791931152,
2940
+ "learning_rate": 9.096870527589248e-05,
2941
+ "loss": 1.1261,
2942
+ "step": 419
2943
+ },
2944
+ {
2945
+ "epoch": 2.107634543178974,
2946
+ "grad_norm": 3.5337955951690674,
2947
+ "learning_rate": 9.044262520996702e-05,
2948
+ "loss": 1.0759,
2949
+ "step": 420
2950
+ },
2951
+ {
2952
+ "epoch": 2.1126408010012514,
2953
+ "grad_norm": 3.5257728099823,
2954
+ "learning_rate": 8.991683457980443e-05,
2955
+ "loss": 1.085,
2956
+ "step": 421
2957
+ },
2958
+ {
2959
+ "epoch": 2.1176470588235294,
2960
+ "grad_norm": 3.7548561096191406,
2961
+ "learning_rate": 8.93913478002205e-05,
2962
+ "loss": 1.3311,
2963
+ "step": 422
2964
+ },
2965
+ {
2966
+ "epoch": 2.122653316645807,
2967
+ "grad_norm": 3.253643751144409,
2968
+ "learning_rate": 8.886617927770065e-05,
2969
+ "loss": 1.0647,
2970
+ "step": 423
2971
+ },
2972
+ {
2973
+ "epoch": 2.127659574468085,
2974
+ "grad_norm": 3.59781551361084,
2975
+ "learning_rate": 8.834134341000527e-05,
2976
+ "loss": 1.1489,
2977
+ "step": 424
2978
+ },
2979
+ {
2980
+ "epoch": 2.132665832290363,
2981
+ "grad_norm": 3.869216203689575,
2982
+ "learning_rate": 8.781685458577481e-05,
2983
+ "loss": 1.2613,
2984
+ "step": 425
2985
+ },
2986
+ {
2987
+ "epoch": 2.1376720901126407,
2988
+ "grad_norm": 3.8844690322875977,
2989
+ "learning_rate": 8.729272718413527e-05,
2990
+ "loss": 1.1593,
2991
+ "step": 426
2992
+ },
2993
+ {
2994
+ "epoch": 2.1426783479349187,
2995
+ "grad_norm": 3.599303722381592,
2996
+ "learning_rate": 8.676897557430415e-05,
2997
+ "loss": 1.1382,
2998
+ "step": 427
2999
+ },
3000
+ {
3001
+ "epoch": 2.1476846057571963,
3002
+ "grad_norm": 3.438326120376587,
3003
+ "learning_rate": 8.624561411519644e-05,
3004
+ "loss": 1.0946,
3005
+ "step": 428
3006
+ },
3007
+ {
3008
+ "epoch": 2.1526908635794744,
3009
+ "grad_norm": 3.7482898235321045,
3010
+ "learning_rate": 8.572265715503086e-05,
3011
+ "loss": 1.1727,
3012
+ "step": 429
3013
+ },
3014
+ {
3015
+ "epoch": 2.1576971214017524,
3016
+ "grad_norm": 3.5767128467559814,
3017
+ "learning_rate": 8.520011903093666e-05,
3018
+ "loss": 1.2824,
3019
+ "step": 430
3020
+ },
3021
+ {
3022
+ "epoch": 2.16270337922403,
3023
+ "grad_norm": 3.148529529571533,
3024
+ "learning_rate": 8.467801406856054e-05,
3025
+ "loss": 0.9895,
3026
+ "step": 431
3027
+ },
3028
+ {
3029
+ "epoch": 2.167709637046308,
3030
+ "grad_norm": 3.4491708278656006,
3031
+ "learning_rate": 8.415635658167368e-05,
3032
+ "loss": 1.1678,
3033
+ "step": 432
3034
+ },
3035
+ {
3036
+ "epoch": 2.1727158948685856,
3037
+ "grad_norm": 3.461238384246826,
3038
+ "learning_rate": 8.363516087177962e-05,
3039
+ "loss": 1.1203,
3040
+ "step": 433
3041
+ },
3042
+ {
3043
+ "epoch": 2.1777221526908637,
3044
+ "grad_norm": 3.5150508880615234,
3045
+ "learning_rate": 8.31144412277221e-05,
3046
+ "loss": 1.1886,
3047
+ "step": 434
3048
+ },
3049
+ {
3050
+ "epoch": 2.1827284105131413,
3051
+ "grad_norm": 3.0719783306121826,
3052
+ "learning_rate": 8.25942119252931e-05,
3053
+ "loss": 0.9225,
3054
+ "step": 435
3055
+ },
3056
+ {
3057
+ "epoch": 2.1877346683354193,
3058
+ "grad_norm": 3.7611846923828125,
3059
+ "learning_rate": 8.20744872268418e-05,
3060
+ "loss": 1.2618,
3061
+ "step": 436
3062
+ },
3063
+ {
3064
+ "epoch": 2.192740926157697,
3065
+ "grad_norm": 4.004642963409424,
3066
+ "learning_rate": 8.155528138088337e-05,
3067
+ "loss": 1.2625,
3068
+ "step": 437
3069
+ },
3070
+ {
3071
+ "epoch": 2.197747183979975,
3072
+ "grad_norm": 3.3355510234832764,
3073
+ "learning_rate": 8.103660862170826e-05,
3074
+ "loss": 1.0592,
3075
+ "step": 438
3076
+ },
3077
+ {
3078
+ "epoch": 2.202753441802253,
3079
+ "grad_norm": 3.7413265705108643,
3080
+ "learning_rate": 8.051848316899227e-05,
3081
+ "loss": 1.208,
3082
+ "step": 439
3083
+ },
3084
+ {
3085
+ "epoch": 2.2077596996245306,
3086
+ "grad_norm": 4.234354496002197,
3087
+ "learning_rate": 8.000091922740633e-05,
3088
+ "loss": 1.4002,
3089
+ "step": 440
3090
+ },
3091
+ {
3092
+ "epoch": 2.2127659574468086,
3093
+ "grad_norm": 4.029099464416504,
3094
+ "learning_rate": 7.948393098622737e-05,
3095
+ "loss": 1.1973,
3096
+ "step": 441
3097
+ },
3098
+ {
3099
+ "epoch": 2.217772215269086,
3100
+ "grad_norm": 3.4637832641601562,
3101
+ "learning_rate": 7.896753261894923e-05,
3102
+ "loss": 1.1302,
3103
+ "step": 442
3104
+ },
3105
+ {
3106
+ "epoch": 2.2227784730913642,
3107
+ "grad_norm": 3.666888952255249,
3108
+ "learning_rate": 7.845173828289392e-05,
3109
+ "loss": 1.1006,
3110
+ "step": 443
3111
+ },
3112
+ {
3113
+ "epoch": 2.2277847309136423,
3114
+ "grad_norm": 3.490281581878662,
3115
+ "learning_rate": 7.793656211882377e-05,
3116
+ "loss": 1.095,
3117
+ "step": 444
3118
+ },
3119
+ {
3120
+ "epoch": 2.23279098873592,
3121
+ "grad_norm": 3.6505391597747803,
3122
+ "learning_rate": 7.74220182505536e-05,
3123
+ "loss": 1.2131,
3124
+ "step": 445
3125
+ },
3126
+ {
3127
+ "epoch": 2.237797246558198,
3128
+ "grad_norm": 3.622451066970825,
3129
+ "learning_rate": 7.690812078456336e-05,
3130
+ "loss": 1.238,
3131
+ "step": 446
3132
+ },
3133
+ {
3134
+ "epoch": 2.2428035043804755,
3135
+ "grad_norm": 3.1041157245635986,
3136
+ "learning_rate": 7.639488380961173e-05,
3137
+ "loss": 0.9489,
3138
+ "step": 447
3139
+ },
3140
+ {
3141
+ "epoch": 2.2478097622027535,
3142
+ "grad_norm": 3.3886702060699463,
3143
+ "learning_rate": 7.588232139634968e-05,
3144
+ "loss": 1.1466,
3145
+ "step": 448
3146
+ },
3147
+ {
3148
+ "epoch": 2.252816020025031,
3149
+ "grad_norm": 3.712101697921753,
3150
+ "learning_rate": 7.537044759693463e-05,
3151
+ "loss": 1.0877,
3152
+ "step": 449
3153
+ },
3154
+ {
3155
+ "epoch": 2.257822277847309,
3156
+ "grad_norm": 3.551365613937378,
3157
+ "learning_rate": 7.48592764446454e-05,
3158
+ "loss": 1.1355,
3159
+ "step": 450
3160
+ },
3161
+ {
3162
+ "epoch": 2.2628285356695867,
3163
+ "grad_norm": 3.4715287685394287,
3164
+ "learning_rate": 7.434882195349736e-05,
3165
+ "loss": 1.1936,
3166
+ "step": 451
3167
+ },
3168
+ {
3169
+ "epoch": 2.267834793491865,
3170
+ "grad_norm": 4.415275573730469,
3171
+ "learning_rate": 7.383909811785817e-05,
3172
+ "loss": 1.3533,
3173
+ "step": 452
3174
+ },
3175
+ {
3176
+ "epoch": 2.272841051314143,
3177
+ "grad_norm": 3.295712471008301,
3178
+ "learning_rate": 7.333011891206432e-05,
3179
+ "loss": 1.2051,
3180
+ "step": 453
3181
+ },
3182
+ {
3183
+ "epoch": 2.2778473091364204,
3184
+ "grad_norm": 4.3286213874816895,
3185
+ "learning_rate": 7.282189829003785e-05,
3186
+ "loss": 1.246,
3187
+ "step": 454
3188
+ },
3189
+ {
3190
+ "epoch": 2.2828535669586985,
3191
+ "grad_norm": 3.628694772720337,
3192
+ "learning_rate": 7.231445018490381e-05,
3193
+ "loss": 1.2319,
3194
+ "step": 455
3195
+ },
3196
+ {
3197
+ "epoch": 2.287859824780976,
3198
+ "grad_norm": 3.730273723602295,
3199
+ "learning_rate": 7.180778850860835e-05,
3200
+ "loss": 1.0754,
3201
+ "step": 456
3202
+ },
3203
+ {
3204
+ "epoch": 2.292866082603254,
3205
+ "grad_norm": 3.6538302898406982,
3206
+ "learning_rate": 7.130192715153731e-05,
3207
+ "loss": 1.2115,
3208
+ "step": 457
3209
+ },
3210
+ {
3211
+ "epoch": 2.297872340425532,
3212
+ "grad_norm": 3.4317686557769775,
3213
+ "learning_rate": 7.079687998213526e-05,
3214
+ "loss": 1.2313,
3215
+ "step": 458
3216
+ },
3217
+ {
3218
+ "epoch": 2.3028785982478097,
3219
+ "grad_norm": 3.6949033737182617,
3220
+ "learning_rate": 7.029266084652548e-05,
3221
+ "loss": 1.1821,
3222
+ "step": 459
3223
+ },
3224
+ {
3225
+ "epoch": 2.3078848560700878,
3226
+ "grad_norm": 3.3573741912841797,
3227
+ "learning_rate": 6.978928356813031e-05,
3228
+ "loss": 1.1804,
3229
+ "step": 460
3230
+ },
3231
+ {
3232
+ "epoch": 2.3128911138923653,
3233
+ "grad_norm": 3.4236843585968018,
3234
+ "learning_rate": 6.92867619472921e-05,
3235
+ "loss": 1.1519,
3236
+ "step": 461
3237
+ },
3238
+ {
3239
+ "epoch": 2.3178973717146434,
3240
+ "grad_norm": 3.1184580326080322,
3241
+ "learning_rate": 6.878510976089493e-05,
3242
+ "loss": 1.0577,
3243
+ "step": 462
3244
+ },
3245
+ {
3246
+ "epoch": 2.322903629536921,
3247
+ "grad_norm": 3.2034711837768555,
3248
+ "learning_rate": 6.828434076198693e-05,
3249
+ "loss": 1.1435,
3250
+ "step": 463
3251
+ },
3252
+ {
3253
+ "epoch": 2.327909887359199,
3254
+ "grad_norm": 3.5450615882873535,
3255
+ "learning_rate": 6.77844686794031e-05,
3256
+ "loss": 1.2075,
3257
+ "step": 464
3258
+ },
3259
+ {
3260
+ "epoch": 2.3329161451814766,
3261
+ "grad_norm": 3.3356549739837646,
3262
+ "learning_rate": 6.728550721738915e-05,
3263
+ "loss": 1.1745,
3264
+ "step": 465
3265
+ },
3266
+ {
3267
+ "epoch": 2.3379224030037546,
3268
+ "grad_norm": 3.2924506664276123,
3269
+ "learning_rate": 6.678747005522557e-05,
3270
+ "loss": 1.0487,
3271
+ "step": 466
3272
+ },
3273
+ {
3274
+ "epoch": 2.3429286608260327,
3275
+ "grad_norm": 3.386472225189209,
3276
+ "learning_rate": 6.629037084685278e-05,
3277
+ "loss": 1.1496,
3278
+ "step": 467
3279
+ },
3280
+ {
3281
+ "epoch": 2.3479349186483103,
3282
+ "grad_norm": 3.7050280570983887,
3283
+ "learning_rate": 6.579422322049668e-05,
3284
+ "loss": 1.3198,
3285
+ "step": 468
3286
+ },
3287
+ {
3288
+ "epoch": 2.3529411764705883,
3289
+ "grad_norm": 3.130352258682251,
3290
+ "learning_rate": 6.529904077829505e-05,
3291
+ "loss": 1.0512,
3292
+ "step": 469
3293
+ },
3294
+ {
3295
+ "epoch": 2.357947434292866,
3296
+ "grad_norm": 3.446882963180542,
3297
+ "learning_rate": 6.480483709592468e-05,
3298
+ "loss": 1.2949,
3299
+ "step": 470
3300
+ },
3301
+ {
3302
+ "epoch": 2.362953692115144,
3303
+ "grad_norm": 3.6783792972564697,
3304
+ "learning_rate": 6.43116257222292e-05,
3305
+ "loss": 1.1859,
3306
+ "step": 471
3307
+ },
3308
+ {
3309
+ "epoch": 2.367959949937422,
3310
+ "grad_norm": 3.3411688804626465,
3311
+ "learning_rate": 6.381942017884753e-05,
3312
+ "loss": 1.1589,
3313
+ "step": 472
3314
+ },
3315
+ {
3316
+ "epoch": 2.3729662077596996,
3317
+ "grad_norm": 3.6124532222747803,
3318
+ "learning_rate": 6.33282339598433e-05,
3319
+ "loss": 1.0375,
3320
+ "step": 473
3321
+ },
3322
+ {
3323
+ "epoch": 2.3779724655819776,
3324
+ "grad_norm": 3.412095785140991,
3325
+ "learning_rate": 6.283808053133484e-05,
3326
+ "loss": 1.0928,
3327
+ "step": 474
3328
+ },
3329
+ {
3330
+ "epoch": 2.382978723404255,
3331
+ "grad_norm": 4.005087852478027,
3332
+ "learning_rate": 6.234897333112594e-05,
3333
+ "loss": 1.3221,
3334
+ "step": 475
3335
+ },
3336
+ {
3337
+ "epoch": 2.3879849812265332,
3338
+ "grad_norm": 2.9916348457336426,
3339
+ "learning_rate": 6.186092576833761e-05,
3340
+ "loss": 1.028,
3341
+ "step": 476
3342
+ },
3343
+ {
3344
+ "epoch": 2.392991239048811,
3345
+ "grad_norm": 3.539137601852417,
3346
+ "learning_rate": 6.137395122304033e-05,
3347
+ "loss": 1.3132,
3348
+ "step": 477
3349
+ },
3350
+ {
3351
+ "epoch": 2.397997496871089,
3352
+ "grad_norm": 3.288222312927246,
3353
+ "learning_rate": 6.088806304588717e-05,
3354
+ "loss": 1.1697,
3355
+ "step": 478
3356
+ },
3357
+ {
3358
+ "epoch": 2.4030037546933665,
3359
+ "grad_norm": 2.9620723724365234,
3360
+ "learning_rate": 6.0403274557748035e-05,
3361
+ "loss": 0.9548,
3362
+ "step": 479
3363
+ },
3364
+ {
3365
+ "epoch": 2.4080100125156445,
3366
+ "grad_norm": 3.1602985858917236,
3367
+ "learning_rate": 5.9919599049344194e-05,
3368
+ "loss": 1.0504,
3369
+ "step": 480
3370
+ },
3371
+ {
3372
+ "epoch": 2.4130162703379225,
3373
+ "grad_norm": 3.3587441444396973,
3374
+ "learning_rate": 5.943704978088402e-05,
3375
+ "loss": 1.072,
3376
+ "step": 481
3377
+ },
3378
+ {
3379
+ "epoch": 2.4180225281602,
3380
+ "grad_norm": 3.729177951812744,
3381
+ "learning_rate": 5.89556399816995e-05,
3382
+ "loss": 1.2125,
3383
+ "step": 482
3384
+ },
3385
+ {
3386
+ "epoch": 2.423028785982478,
3387
+ "grad_norm": 3.408430576324463,
3388
+ "learning_rate": 5.847538284988341e-05,
3389
+ "loss": 1.1585,
3390
+ "step": 483
3391
+ },
3392
+ {
3393
+ "epoch": 2.4280350438047558,
3394
+ "grad_norm": 3.401205062866211,
3395
+ "learning_rate": 5.7996291551927666e-05,
3396
+ "loss": 1.1512,
3397
+ "step": 484
3398
+ },
3399
+ {
3400
+ "epoch": 2.433041301627034,
3401
+ "grad_norm": 3.5880188941955566,
3402
+ "learning_rate": 5.751837922236217e-05,
3403
+ "loss": 1.2643,
3404
+ "step": 485
3405
+ },
3406
+ {
3407
+ "epoch": 2.438047559449312,
3408
+ "grad_norm": 3.426093578338623,
3409
+ "learning_rate": 5.704165896339494e-05,
3410
+ "loss": 1.2169,
3411
+ "step": 486
3412
+ },
3413
+ {
3414
+ "epoch": 2.4430538172715894,
3415
+ "grad_norm": 3.17655611038208,
3416
+ "learning_rate": 5.656614384455257e-05,
3417
+ "loss": 1.0476,
3418
+ "step": 487
3419
+ },
3420
+ {
3421
+ "epoch": 2.4480600750938675,
3422
+ "grad_norm": 3.2203094959259033,
3423
+ "learning_rate": 5.609184690232235e-05,
3424
+ "loss": 1.0628,
3425
+ "step": 488
3426
+ },
3427
+ {
3428
+ "epoch": 2.453066332916145,
3429
+ "grad_norm": 3.196530342102051,
3430
+ "learning_rate": 5.5618781139794465e-05,
3431
+ "loss": 1.0765,
3432
+ "step": 489
3433
+ },
3434
+ {
3435
+ "epoch": 2.458072590738423,
3436
+ "grad_norm": 3.410573959350586,
3437
+ "learning_rate": 5.514695952630578e-05,
3438
+ "loss": 0.9914,
3439
+ "step": 490
3440
+ },
3441
+ {
3442
+ "epoch": 2.4630788485607007,
3443
+ "grad_norm": 3.6145739555358887,
3444
+ "learning_rate": 5.467639499708423e-05,
3445
+ "loss": 1.111,
3446
+ "step": 491
3447
+ },
3448
+ {
3449
+ "epoch": 2.4680851063829787,
3450
+ "grad_norm": 3.244371175765991,
3451
+ "learning_rate": 5.420710045289399e-05,
3452
+ "loss": 1.0385,
3453
+ "step": 492
3454
+ },
3455
+ {
3456
+ "epoch": 2.4730913642052563,
3457
+ "grad_norm": 3.114959239959717,
3458
+ "learning_rate": 5.373908875968211e-05,
3459
+ "loss": 1.1609,
3460
+ "step": 493
3461
+ },
3462
+ {
3463
+ "epoch": 2.4780976220275344,
3464
+ "grad_norm": 3.3504631519317627,
3465
+ "learning_rate": 5.3272372748225556e-05,
3466
+ "loss": 1.2219,
3467
+ "step": 494
3468
+ },
3469
+ {
3470
+ "epoch": 2.4831038798498124,
3471
+ "grad_norm": 3.6841323375701904,
3472
+ "learning_rate": 5.2806965213779544e-05,
3473
+ "loss": 1.2159,
3474
+ "step": 495
3475
+ },
3476
+ {
3477
+ "epoch": 2.48811013767209,
3478
+ "grad_norm": 3.017163038253784,
3479
+ "learning_rate": 5.234287891572674e-05,
3480
+ "loss": 0.9543,
3481
+ "step": 496
3482
+ },
3483
+ {
3484
+ "epoch": 2.493116395494368,
3485
+ "grad_norm": 3.568748950958252,
3486
+ "learning_rate": 5.1880126577227464e-05,
3487
+ "loss": 1.3312,
3488
+ "step": 497
3489
+ },
3490
+ {
3491
+ "epoch": 2.4981226533166456,
3492
+ "grad_norm": 2.6787469387054443,
3493
+ "learning_rate": 5.141872088487078e-05,
3494
+ "loss": 0.9538,
3495
+ "step": 498
3496
+ },
3497
+ {
3498
+ "epoch": 2.5031289111389237,
3499
+ "grad_norm": 3.8604955673217773,
3500
+ "learning_rate": 5.095867448832683e-05,
3501
+ "loss": 1.3486,
3502
+ "step": 499
3503
+ },
3504
+ {
3505
+ "epoch": 2.5081351689612017,
3506
+ "grad_norm": 3.5043580532073975,
3507
+ "learning_rate": 5.050000000000002e-05,
3508
+ "loss": 1.1804,
3509
+ "step": 500
3510
+ },
3511
+ {
3512
+ "epoch": 2.5131414267834793,
3513
+ "grad_norm": 3.233349323272705,
3514
+ "learning_rate": 5.004270999468307e-05,
3515
+ "loss": 1.1243,
3516
+ "step": 501
3517
+ },
3518
+ {
3519
+ "epoch": 2.5181476846057573,
3520
+ "grad_norm": 3.219106912612915,
3521
+ "learning_rate": 4.95868170092125e-05,
3522
+ "loss": 1.3048,
3523
+ "step": 502
3524
+ },
3525
+ {
3526
+ "epoch": 2.523153942428035,
3527
+ "grad_norm": 3.1391568183898926,
3528
+ "learning_rate": 4.913233354212485e-05,
3529
+ "loss": 1.1749,
3530
+ "step": 503
3531
+ },
3532
+ {
3533
+ "epoch": 2.528160200250313,
3534
+ "grad_norm": 2.795653820037842,
3535
+ "learning_rate": 4.867927205331386e-05,
3536
+ "loss": 0.9872,
3537
+ "step": 504
3538
+ },
3539
+ {
3540
+ "epoch": 2.533166458072591,
3541
+ "grad_norm": 3.4815118312835693,
3542
+ "learning_rate": 4.822764496368917e-05,
3543
+ "loss": 1.2715,
3544
+ "step": 505
3545
+ },
3546
+ {
3547
+ "epoch": 2.5381727158948686,
3548
+ "grad_norm": 3.4137794971466064,
3549
+ "learning_rate": 4.7777464654835564e-05,
3550
+ "loss": 1.1253,
3551
+ "step": 506
3552
+ },
3553
+ {
3554
+ "epoch": 2.543178973717146,
3555
+ "grad_norm": 3.228795289993286,
3556
+ "learning_rate": 4.732874346867362e-05,
3557
+ "loss": 1.111,
3558
+ "step": 507
3559
+ },
3560
+ {
3561
+ "epoch": 2.5481852315394242,
3562
+ "grad_norm": 2.9751367568969727,
3563
+ "learning_rate": 4.6881493707121315e-05,
3564
+ "loss": 0.9749,
3565
+ "step": 508
3566
+ },
3567
+ {
3568
+ "epoch": 2.5531914893617023,
3569
+ "grad_norm": 4.011078834533691,
3570
+ "learning_rate": 4.643572763175684e-05,
3571
+ "loss": 1.1068,
3572
+ "step": 509
3573
+ },
3574
+ {
3575
+ "epoch": 2.55819774718398,
3576
+ "grad_norm": 3.416471481323242,
3577
+ "learning_rate": 4.5991457463482264e-05,
3578
+ "loss": 1.1882,
3579
+ "step": 510
3580
+ },
3581
+ {
3582
+ "epoch": 2.563204005006258,
3583
+ "grad_norm": 2.948613166809082,
3584
+ "learning_rate": 4.554869538218868e-05,
3585
+ "loss": 0.9455,
3586
+ "step": 511
3587
+ },
3588
+ {
3589
+ "epoch": 2.5682102628285355,
3590
+ "grad_norm": 3.4453470706939697,
3591
+ "learning_rate": 4.5107453526422255e-05,
3592
+ "loss": 1.3593,
3593
+ "step": 512
3594
+ },
3595
+ {
3596
+ "epoch": 2.5732165206508135,
3597
+ "grad_norm": 3.316791296005249,
3598
+ "learning_rate": 4.46677439930513e-05,
3599
+ "loss": 1.1298,
3600
+ "step": 513
3601
+ },
3602
+ {
3603
+ "epoch": 2.5782227784730916,
3604
+ "grad_norm": 2.7384519577026367,
3605
+ "learning_rate": 4.422957883693483e-05,
3606
+ "loss": 0.9876,
3607
+ "step": 514
3608
+ },
3609
+ {
3610
+ "epoch": 2.583229036295369,
3611
+ "grad_norm": 3.192760705947876,
3612
+ "learning_rate": 4.3792970070591906e-05,
3613
+ "loss": 1.0523,
3614
+ "step": 515
3615
+ },
3616
+ {
3617
+ "epoch": 2.588235294117647,
3618
+ "grad_norm": 3.4807369709014893,
3619
+ "learning_rate": 4.3357929663872406e-05,
3620
+ "loss": 1.0323,
3621
+ "step": 516
3622
+ },
3623
+ {
3624
+ "epoch": 2.593241551939925,
3625
+ "grad_norm": 3.2211036682128906,
3626
+ "learning_rate": 4.29244695436289e-05,
3627
+ "loss": 0.9843,
3628
+ "step": 517
3629
+ },
3630
+ {
3631
+ "epoch": 2.598247809762203,
3632
+ "grad_norm": 3.5839602947235107,
3633
+ "learning_rate": 4.249260159338946e-05,
3634
+ "loss": 1.1994,
3635
+ "step": 518
3636
+ },
3637
+ {
3638
+ "epoch": 2.603254067584481,
3639
+ "grad_norm": 3.289240837097168,
3640
+ "learning_rate": 4.2062337653032146e-05,
3641
+ "loss": 1.084,
3642
+ "step": 519
3643
+ },
3644
+ {
3645
+ "epoch": 2.6082603254067585,
3646
+ "grad_norm": 3.4030802249908447,
3647
+ "learning_rate": 4.1633689518460225e-05,
3648
+ "loss": 1.2044,
3649
+ "step": 520
3650
+ },
3651
+ {
3652
+ "epoch": 2.613266583229036,
3653
+ "grad_norm": 3.1492092609405518,
3654
+ "learning_rate": 4.1206668941278826e-05,
3655
+ "loss": 1.0259,
3656
+ "step": 521
3657
+ },
3658
+ {
3659
+ "epoch": 2.618272841051314,
3660
+ "grad_norm": 3.439842939376831,
3661
+ "learning_rate": 4.078128762847279e-05,
3662
+ "loss": 1.2384,
3663
+ "step": 522
3664
+ },
3665
+ {
3666
+ "epoch": 2.623279098873592,
3667
+ "grad_norm": 3.105792284011841,
3668
+ "learning_rate": 4.035755724208573e-05,
3669
+ "loss": 1.0454,
3670
+ "step": 523
3671
+ },
3672
+ {
3673
+ "epoch": 2.6282853566958697,
3674
+ "grad_norm": 3.2097530364990234,
3675
+ "learning_rate": 3.9935489398900145e-05,
3676
+ "loss": 1.1008,
3677
+ "step": 524
3678
+ },
3679
+ {
3680
+ "epoch": 2.6332916145181477,
3681
+ "grad_norm": 2.9623773097991943,
3682
+ "learning_rate": 3.951509567011922e-05,
3683
+ "loss": 0.9429,
3684
+ "step": 525
3685
+ },
3686
+ {
3687
+ "epoch": 2.6382978723404253,
3688
+ "grad_norm": 3.3015732765197754,
3689
+ "learning_rate": 3.90963875810494e-05,
3690
+ "loss": 1.1089,
3691
+ "step": 526
3692
+ },
3693
+ {
3694
+ "epoch": 2.6433041301627034,
3695
+ "grad_norm": 3.1678755283355713,
3696
+ "learning_rate": 3.86793766107844e-05,
3697
+ "loss": 1.0696,
3698
+ "step": 527
3699
+ },
3700
+ {
3701
+ "epoch": 2.6483103879849814,
3702
+ "grad_norm": 2.9453158378601074,
3703
+ "learning_rate": 3.826407419189066e-05,
3704
+ "loss": 0.9968,
3705
+ "step": 528
3706
+ },
3707
+ {
3708
+ "epoch": 2.653316645807259,
3709
+ "grad_norm": 2.9824790954589844,
3710
+ "learning_rate": 3.785049171009381e-05,
3711
+ "loss": 1.103,
3712
+ "step": 529
3713
+ },
3714
+ {
3715
+ "epoch": 2.658322903629537,
3716
+ "grad_norm": 3.202838659286499,
3717
+ "learning_rate": 3.743864050396644e-05,
3718
+ "loss": 1.1283,
3719
+ "step": 530
3720
+ },
3721
+ {
3722
+ "epoch": 2.6633291614518146,
3723
+ "grad_norm": 3.488142728805542,
3724
+ "learning_rate": 3.7028531864617444e-05,
3725
+ "loss": 1.1765,
3726
+ "step": 531
3727
+ },
3728
+ {
3729
+ "epoch": 2.6683354192740927,
3730
+ "grad_norm": 3.3499858379364014,
3731
+ "learning_rate": 3.662017703538234e-05,
3732
+ "loss": 1.2133,
3733
+ "step": 532
3734
+ },
3735
+ {
3736
+ "epoch": 2.6733416770963707,
3737
+ "grad_norm": 3.035024881362915,
3738
+ "learning_rate": 3.621358721151505e-05,
3739
+ "loss": 1.1001,
3740
+ "step": 533
3741
+ },
3742
+ {
3743
+ "epoch": 2.6783479349186483,
3744
+ "grad_norm": 3.237748861312866,
3745
+ "learning_rate": 3.5808773539880973e-05,
3746
+ "loss": 1.1989,
3747
+ "step": 534
3748
+ },
3749
+ {
3750
+ "epoch": 2.683354192740926,
3751
+ "grad_norm": 3.3954966068267822,
3752
+ "learning_rate": 3.540574711865146e-05,
3753
+ "loss": 1.202,
3754
+ "step": 535
3755
+ },
3756
+ {
3757
+ "epoch": 2.688360450563204,
3758
+ "grad_norm": 3.1598644256591797,
3759
+ "learning_rate": 3.500451899699935e-05,
3760
+ "loss": 1.1158,
3761
+ "step": 536
3762
+ },
3763
+ {
3764
+ "epoch": 2.693366708385482,
3765
+ "grad_norm": 3.440000295639038,
3766
+ "learning_rate": 3.460510017479631e-05,
3767
+ "loss": 1.2114,
3768
+ "step": 537
3769
+ },
3770
+ {
3771
+ "epoch": 2.6983729662077596,
3772
+ "grad_norm": 2.966639518737793,
3773
+ "learning_rate": 3.420750160231118e-05,
3774
+ "loss": 0.9118,
3775
+ "step": 538
3776
+ },
3777
+ {
3778
+ "epoch": 2.7033792240300376,
3779
+ "grad_norm": 3.380932092666626,
3780
+ "learning_rate": 3.381173417990957e-05,
3781
+ "loss": 1.0835,
3782
+ "step": 539
3783
+ },
3784
+ {
3785
+ "epoch": 2.708385481852315,
3786
+ "grad_norm": 3.229613780975342,
3787
+ "learning_rate": 3.3417808757755355e-05,
3788
+ "loss": 1.138,
3789
+ "step": 540
3790
+ },
3791
+ {
3792
+ "epoch": 2.7133917396745932,
3793
+ "grad_norm": 3.15791916847229,
3794
+ "learning_rate": 3.302573613551292e-05,
3795
+ "loss": 1.2285,
3796
+ "step": 541
3797
+ },
3798
+ {
3799
+ "epoch": 2.7183979974968713,
3800
+ "grad_norm": 2.8461785316467285,
3801
+ "learning_rate": 3.263552706205128e-05,
3802
+ "loss": 0.9287,
3803
+ "step": 542
3804
+ },
3805
+ {
3806
+ "epoch": 2.723404255319149,
3807
+ "grad_norm": 2.8767101764678955,
3808
+ "learning_rate": 3.22471922351493e-05,
3809
+ "loss": 1.0412,
3810
+ "step": 543
3811
+ },
3812
+ {
3813
+ "epoch": 2.728410513141427,
3814
+ "grad_norm": 2.858546018600464,
3815
+ "learning_rate": 3.186074230120244e-05,
3816
+ "loss": 1.0807,
3817
+ "step": 544
3818
+ },
3819
+ {
3820
+ "epoch": 2.7334167709637045,
3821
+ "grad_norm": 2.865001678466797,
3822
+ "learning_rate": 3.147618785493083e-05,
3823
+ "loss": 1.0954,
3824
+ "step": 545
3825
+ },
3826
+ {
3827
+ "epoch": 2.7384230287859825,
3828
+ "grad_norm": 3.1909632682800293,
3829
+ "learning_rate": 3.109353943908893e-05,
3830
+ "loss": 1.1101,
3831
+ "step": 546
3832
+ },
3833
+ {
3834
+ "epoch": 2.7434292866082606,
3835
+ "grad_norm": 3.1679904460906982,
3836
+ "learning_rate": 3.071280754417626e-05,
3837
+ "loss": 1.031,
3838
+ "step": 547
3839
+ },
3840
+ {
3841
+ "epoch": 2.748435544430538,
3842
+ "grad_norm": 3.0772807598114014,
3843
+ "learning_rate": 3.033400260815008e-05,
3844
+ "loss": 1.1844,
3845
+ "step": 548
3846
+ },
3847
+ {
3848
+ "epoch": 2.7534418022528158,
3849
+ "grad_norm": 3.3308348655700684,
3850
+ "learning_rate": 2.9957135016139122e-05,
3851
+ "loss": 1.3589,
3852
+ "step": 549
3853
+ },
3854
+ {
3855
+ "epoch": 2.758448060075094,
3856
+ "grad_norm": 3.0506107807159424,
3857
+ "learning_rate": 2.9582215100158706e-05,
3858
+ "loss": 1.1545,
3859
+ "step": 550
3860
+ },
3861
+ {
3862
+ "epoch": 2.763454317897372,
3863
+ "grad_norm": 3.096111536026001,
3864
+ "learning_rate": 2.920925313882776e-05,
3865
+ "loss": 1.1531,
3866
+ "step": 551
3867
+ },
3868
+ {
3869
+ "epoch": 2.7684605757196494,
3870
+ "grad_norm": 2.8558101654052734,
3871
+ "learning_rate": 2.8838259357086884e-05,
3872
+ "loss": 0.975,
3873
+ "step": 552
3874
+ },
3875
+ {
3876
+ "epoch": 2.7734668335419275,
3877
+ "grad_norm": 3.4529948234558105,
3878
+ "learning_rate": 2.846924392591794e-05,
3879
+ "loss": 1.2278,
3880
+ "step": 553
3881
+ },
3882
+ {
3883
+ "epoch": 2.778473091364205,
3884
+ "grad_norm": 2.830240249633789,
3885
+ "learning_rate": 2.8102216962065423e-05,
3886
+ "loss": 0.9457,
3887
+ "step": 554
3888
+ },
3889
+ {
3890
+ "epoch": 2.783479349186483,
3891
+ "grad_norm": 2.8564674854278564,
3892
+ "learning_rate": 2.7737188527758972e-05,
3893
+ "loss": 0.9811,
3894
+ "step": 555
3895
+ },
3896
+ {
3897
+ "epoch": 2.788485607008761,
3898
+ "grad_norm": 3.3456573486328125,
3899
+ "learning_rate": 2.7374168630437456e-05,
3900
+ "loss": 1.1045,
3901
+ "step": 556
3902
+ },
3903
+ {
3904
+ "epoch": 2.7934918648310387,
3905
+ "grad_norm": 2.9954707622528076,
3906
+ "learning_rate": 2.7013167222474756e-05,
3907
+ "loss": 1.1134,
3908
+ "step": 557
3909
+ },
3910
+ {
3911
+ "epoch": 2.7984981226533168,
3912
+ "grad_norm": 2.952683448791504,
3913
+ "learning_rate": 2.6654194200906833e-05,
3914
+ "loss": 1.0336,
3915
+ "step": 558
3916
+ },
3917
+ {
3918
+ "epoch": 2.8035043804755944,
3919
+ "grad_norm": 3.3193068504333496,
3920
+ "learning_rate": 2.629725940716041e-05,
3921
+ "loss": 1.1333,
3922
+ "step": 559
3923
+ },
3924
+ {
3925
+ "epoch": 2.8085106382978724,
3926
+ "grad_norm": 3.06868577003479,
3927
+ "learning_rate": 2.5942372626783172e-05,
3928
+ "loss": 1.0866,
3929
+ "step": 560
3930
+ },
3931
+ {
3932
+ "epoch": 2.8135168961201504,
3933
+ "grad_norm": 2.9460933208465576,
3934
+ "learning_rate": 2.5589543589175485e-05,
3935
+ "loss": 1.0895,
3936
+ "step": 561
3937
+ },
3938
+ {
3939
+ "epoch": 2.818523153942428,
3940
+ "grad_norm": 3.093489170074463,
3941
+ "learning_rate": 2.523878196732358e-05,
3942
+ "loss": 1.1445,
3943
+ "step": 562
3944
+ },
3945
+ {
3946
+ "epoch": 2.8235294117647056,
3947
+ "grad_norm": 3.155773162841797,
3948
+ "learning_rate": 2.489009737753459e-05,
3949
+ "loss": 1.1296,
3950
+ "step": 563
3951
+ },
3952
+ {
3953
+ "epoch": 2.8285356695869837,
3954
+ "grad_norm": 2.886901378631592,
3955
+ "learning_rate": 2.4543499379172615e-05,
3956
+ "loss": 1.0545,
3957
+ "step": 564
3958
+ },
3959
+ {
3960
+ "epoch": 2.8335419274092617,
3961
+ "grad_norm": 3.0896921157836914,
3962
+ "learning_rate": 2.4198997474396877e-05,
3963
+ "loss": 1.0446,
3964
+ "step": 565
3965
+ },
3966
+ {
3967
+ "epoch": 2.8385481852315393,
3968
+ "grad_norm": 2.9552993774414062,
3969
+ "learning_rate": 2.3856601107901166e-05,
3970
+ "loss": 0.8962,
3971
+ "step": 566
3972
+ },
3973
+ {
3974
+ "epoch": 2.8435544430538173,
3975
+ "grad_norm": 2.9749550819396973,
3976
+ "learning_rate": 2.351631966665476e-05,
3977
+ "loss": 1.0768,
3978
+ "step": 567
3979
+ },
3980
+ {
3981
+ "epoch": 2.848560700876095,
3982
+ "grad_norm": 3.1430954933166504,
3983
+ "learning_rate": 2.31781624796453e-05,
3984
+ "loss": 1.1391,
3985
+ "step": 568
3986
+ },
3987
+ {
3988
+ "epoch": 2.853566958698373,
3989
+ "grad_norm": 3.2870116233825684,
3990
+ "learning_rate": 2.2842138817622883e-05,
3991
+ "loss": 0.985,
3992
+ "step": 569
3993
+ },
3994
+ {
3995
+ "epoch": 2.858573216520651,
3996
+ "grad_norm": 2.746265172958374,
3997
+ "learning_rate": 2.250825789284594e-05,
3998
+ "loss": 0.9044,
3999
+ "step": 570
4000
+ },
4001
+ {
4002
+ "epoch": 2.8635794743429286,
4003
+ "grad_norm": 3.157536506652832,
4004
+ "learning_rate": 2.217652885882869e-05,
4005
+ "loss": 1.0997,
4006
+ "step": 571
4007
+ },
4008
+ {
4009
+ "epoch": 2.8685857321652066,
4010
+ "grad_norm": 3.519000291824341,
4011
+ "learning_rate": 2.1846960810090188e-05,
4012
+ "loss": 1.2928,
4013
+ "step": 572
4014
+ },
4015
+ {
4016
+ "epoch": 2.873591989987484,
4017
+ "grad_norm": 3.1560218334198,
4018
+ "learning_rate": 2.151956278190494e-05,
4019
+ "loss": 1.136,
4020
+ "step": 573
4021
+ },
4022
+ {
4023
+ "epoch": 2.8785982478097623,
4024
+ "grad_norm": 3.072340965270996,
4025
+ "learning_rate": 2.119434375005527e-05,
4026
+ "loss": 1.087,
4027
+ "step": 574
4028
+ },
4029
+ {
4030
+ "epoch": 2.8836045056320403,
4031
+ "grad_norm": 2.9986274242401123,
4032
+ "learning_rate": 2.087131263058526e-05,
4033
+ "loss": 0.9521,
4034
+ "step": 575
4035
+ },
4036
+ {
4037
+ "epoch": 2.888610763454318,
4038
+ "grad_norm": 3.377012014389038,
4039
+ "learning_rate": 2.055047827955618e-05,
4040
+ "loss": 1.205,
4041
+ "step": 576
4042
+ },
4043
+ {
4044
+ "epoch": 2.8936170212765955,
4045
+ "grad_norm": 3.326692819595337,
4046
+ "learning_rate": 2.0231849492803852e-05,
4047
+ "loss": 1.1575,
4048
+ "step": 577
4049
+ },
4050
+ {
4051
+ "epoch": 2.8986232790988735,
4052
+ "grad_norm": 2.875267744064331,
4053
+ "learning_rate": 1.991543500569745e-05,
4054
+ "loss": 0.9717,
4055
+ "step": 578
4056
+ },
4057
+ {
4058
+ "epoch": 2.9036295369211516,
4059
+ "grad_norm": 3.402390956878662,
4060
+ "learning_rate": 1.960124349289992e-05,
4061
+ "loss": 1.3638,
4062
+ "step": 579
4063
+ },
4064
+ {
4065
+ "epoch": 2.908635794743429,
4066
+ "grad_norm": 3.127502202987671,
4067
+ "learning_rate": 1.928928356813032e-05,
4068
+ "loss": 1.1322,
4069
+ "step": 580
4070
+ },
4071
+ {
4072
+ "epoch": 2.913642052565707,
4073
+ "grad_norm": 3.1292340755462646,
4074
+ "learning_rate": 1.8979563783927565e-05,
4075
+ "loss": 1.1917,
4076
+ "step": 581
4077
+ },
4078
+ {
4079
+ "epoch": 2.918648310387985,
4080
+ "grad_norm": 2.690685510635376,
4081
+ "learning_rate": 1.8672092631416013e-05,
4082
+ "loss": 0.845,
4083
+ "step": 582
4084
+ },
4085
+ {
4086
+ "epoch": 2.923654568210263,
4087
+ "grad_norm": 3.146888256072998,
4088
+ "learning_rate": 1.8366878540072614e-05,
4089
+ "loss": 1.1461,
4090
+ "step": 583
4091
+ },
4092
+ {
4093
+ "epoch": 2.928660826032541,
4094
+ "grad_norm": 3.0849928855895996,
4095
+ "learning_rate": 1.8063929877495892e-05,
4096
+ "loss": 1.0927,
4097
+ "step": 584
4098
+ },
4099
+ {
4100
+ "epoch": 2.9336670838548184,
4101
+ "grad_norm": 2.757366180419922,
4102
+ "learning_rate": 1.7763254949176414e-05,
4103
+ "loss": 0.903,
4104
+ "step": 585
4105
+ },
4106
+ {
4107
+ "epoch": 2.9386733416770965,
4108
+ "grad_norm": 3.00736665725708,
4109
+ "learning_rate": 1.7464861998269243e-05,
4110
+ "loss": 1.0627,
4111
+ "step": 586
4112
+ },
4113
+ {
4114
+ "epoch": 2.943679599499374,
4115
+ "grad_norm": 2.767845630645752,
4116
+ "learning_rate": 1.7168759205367893e-05,
4117
+ "loss": 1.0366,
4118
+ "step": 587
4119
+ },
4120
+ {
4121
+ "epoch": 2.948685857321652,
4122
+ "grad_norm": 3.163112163543701,
4123
+ "learning_rate": 1.6874954688279956e-05,
4124
+ "loss": 1.1653,
4125
+ "step": 588
4126
+ },
4127
+ {
4128
+ "epoch": 2.95369211514393,
4129
+ "grad_norm": 3.0217740535736084,
4130
+ "learning_rate": 1.6583456501804725e-05,
4131
+ "loss": 1.1163,
4132
+ "step": 589
4133
+ },
4134
+ {
4135
+ "epoch": 2.9586983729662077,
4136
+ "grad_norm": 2.995216131210327,
4137
+ "learning_rate": 1.6294272637512183e-05,
4138
+ "loss": 0.9859,
4139
+ "step": 590
4140
+ },
4141
+ {
4142
+ "epoch": 2.9637046307884853,
4143
+ "grad_norm": 3.1709847450256348,
4144
+ "learning_rate": 1.600741102352409e-05,
4145
+ "loss": 1.0269,
4146
+ "step": 591
4147
+ },
4148
+ {
4149
+ "epoch": 2.9687108886107634,
4150
+ "grad_norm": 3.2043511867523193,
4151
+ "learning_rate": 1.57228795242965e-05,
4152
+ "loss": 1.1052,
4153
+ "step": 592
4154
+ },
4155
+ {
4156
+ "epoch": 2.9737171464330414,
4157
+ "grad_norm": 2.9635226726531982,
4158
+ "learning_rate": 1.544068594040417e-05,
4159
+ "loss": 0.8923,
4160
+ "step": 593
4161
+ },
4162
+ {
4163
+ "epoch": 2.978723404255319,
4164
+ "grad_norm": 3.1605000495910645,
4165
+ "learning_rate": 1.516083800832676e-05,
4166
+ "loss": 1.1042,
4167
+ "step": 594
4168
+ },
4169
+ {
4170
+ "epoch": 2.983729662077597,
4171
+ "grad_norm": 3.1283183097839355,
4172
+ "learning_rate": 1.488334340023669e-05,
4173
+ "loss": 1.1628,
4174
+ "step": 595
4175
+ },
4176
+ {
4177
+ "epoch": 2.9887359198998746,
4178
+ "grad_norm": 3.149639129638672,
4179
+ "learning_rate": 1.4608209723788835e-05,
4180
+ "loss": 1.2558,
4181
+ "step": 596
4182
+ },
4183
+ {
4184
+ "epoch": 2.9937421777221527,
4185
+ "grad_norm": 2.6690926551818848,
4186
+ "learning_rate": 1.4335444521911899e-05,
4187
+ "loss": 0.8779,
4188
+ "step": 597
4189
  }
4190
  ],
4191
  "logging_steps": 1,
 
4205
  "attributes": {}
4206
  }
4207
  },
4208
+ "total_flos": 1.8782958184195686e+18,
4209
  "train_batch_size": 4,
4210
  "trial_name": null,
4211
  "trial_params": null