| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.1141552511415525, | |
| "eval_steps": 0, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.00022831050228310502, | |
| "grad_norm": 59.67314529418945, | |
| "learning_rate": 0.0, | |
| "loss": 10.6752, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.00045662100456621003, | |
| "grad_norm": 17.693151473999023, | |
| "learning_rate": 0.0002559580248098155, | |
| "loss": 0.6248, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.0006849315068493151, | |
| "grad_norm": 18.046871185302734, | |
| "learning_rate": 0.00040568387108221287, | |
| "loss": 0.6678, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.0009132420091324201, | |
| "grad_norm": 0.9820695519447327, | |
| "learning_rate": 0.000511916049619631, | |
| "loss": 0.0564, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.001141552511415525, | |
| "grad_norm": 0.2884497046470642, | |
| "learning_rate": 0.000594316128917787, | |
| "loss": 0.034, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.0013698630136986301, | |
| "grad_norm": 0.0833420529961586, | |
| "learning_rate": 0.0006616418958920283, | |
| "loss": 0.0307, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.0015981735159817352, | |
| "grad_norm": 0.10984054207801819, | |
| "learning_rate": 0.0007185650207899778, | |
| "loss": 0.0311, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.0018264840182648401, | |
| "grad_norm": 0.13283254206180573, | |
| "learning_rate": 0.0007678740744294463, | |
| "loss": 0.032, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.002054794520547945, | |
| "grad_norm": 0.14496515691280365, | |
| "learning_rate": 0.0008113677421644257, | |
| "loss": 0.0303, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.00228310502283105, | |
| "grad_norm": 0.14172254502773285, | |
| "learning_rate": 0.0008502741537276026, | |
| "loss": 0.0294, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.002511415525114155, | |
| "grad_norm": 0.11957576870918274, | |
| "learning_rate": 0.0008854692840710254, | |
| "loss": 0.0302, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.0027397260273972603, | |
| "grad_norm": 0.11714160442352295, | |
| "learning_rate": 0.0009175999207018438, | |
| "loss": 0.0283, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0029680365296803654, | |
| "grad_norm": 0.12587222456932068, | |
| "learning_rate": 0.0009471572411831842, | |
| "loss": 0.0284, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.0031963470319634705, | |
| "grad_norm": 0.13972364366054535, | |
| "learning_rate": 0.0009745230455997932, | |
| "loss": 0.0292, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.003424657534246575, | |
| "grad_norm": 0.1309085339307785, | |
| "learning_rate": 0.0009999999999999998, | |
| "loss": 0.0272, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.0036529680365296802, | |
| "grad_norm": 0.18927231431007385, | |
| "learning_rate": 0.001, | |
| "loss": 0.0254, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.0038812785388127853, | |
| "grad_norm": 0.16019679605960846, | |
| "learning_rate": 0.001, | |
| "loss": 0.0285, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.00410958904109589, | |
| "grad_norm": 0.14211571216583252, | |
| "learning_rate": 0.001, | |
| "loss": 0.026, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.0043378995433789955, | |
| "grad_norm": 0.17546725273132324, | |
| "learning_rate": 0.001, | |
| "loss": 0.0271, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.0045662100456621, | |
| "grad_norm": 0.12021715939044952, | |
| "learning_rate": 0.001, | |
| "loss": 0.024, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.004794520547945206, | |
| "grad_norm": 0.13626410067081451, | |
| "learning_rate": 0.001, | |
| "loss": 0.0261, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.00502283105022831, | |
| "grad_norm": 0.12207438051700592, | |
| "learning_rate": 0.001, | |
| "loss": 0.0244, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.005251141552511416, | |
| "grad_norm": 0.17474311590194702, | |
| "learning_rate": 0.001, | |
| "loss": 0.0237, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.005479452054794521, | |
| "grad_norm": 0.042765919119119644, | |
| "learning_rate": 0.001, | |
| "loss": 0.0206, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.005707762557077625, | |
| "grad_norm": 0.0823250487446785, | |
| "learning_rate": 0.001, | |
| "loss": 0.0218, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.005936073059360731, | |
| "grad_norm": 0.09036653488874435, | |
| "learning_rate": 0.001, | |
| "loss": 0.0211, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.0061643835616438354, | |
| "grad_norm": 0.06528954952955246, | |
| "learning_rate": 0.001, | |
| "loss": 0.0205, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.006392694063926941, | |
| "grad_norm": 0.07076761871576309, | |
| "learning_rate": 0.001, | |
| "loss": 0.0205, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.006621004566210046, | |
| "grad_norm": 0.08131472766399384, | |
| "learning_rate": 0.001, | |
| "loss": 0.0191, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.00684931506849315, | |
| "grad_norm": 0.097812220454216, | |
| "learning_rate": 0.001, | |
| "loss": 0.0191, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.007077625570776256, | |
| "grad_norm": 0.06373079121112823, | |
| "learning_rate": 0.001, | |
| "loss": 0.0191, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.0073059360730593605, | |
| "grad_norm": 0.05190230533480644, | |
| "learning_rate": 0.001, | |
| "loss": 0.018, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.007534246575342466, | |
| "grad_norm": 0.059811294078826904, | |
| "learning_rate": 0.001, | |
| "loss": 0.0191, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.007762557077625571, | |
| "grad_norm": 0.06886769086122513, | |
| "learning_rate": 0.001, | |
| "loss": 0.0172, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.007990867579908675, | |
| "grad_norm": 0.06065753847360611, | |
| "learning_rate": 0.001, | |
| "loss": 0.0179, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.00821917808219178, | |
| "grad_norm": 0.047076545655727386, | |
| "learning_rate": 0.001, | |
| "loss": 0.0165, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.008447488584474886, | |
| "grad_norm": 0.07710444182157516, | |
| "learning_rate": 0.001, | |
| "loss": 0.016, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.008675799086757991, | |
| "grad_norm": 0.050819285213947296, | |
| "learning_rate": 0.001, | |
| "loss": 0.0161, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.008904109589041096, | |
| "grad_norm": 0.04452894255518913, | |
| "learning_rate": 0.001, | |
| "loss": 0.0148, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.0091324200913242, | |
| "grad_norm": 0.06119012087583542, | |
| "learning_rate": 0.001, | |
| "loss": 0.0147, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.009360730593607305, | |
| "grad_norm": 0.043577950447797775, | |
| "learning_rate": 0.001, | |
| "loss": 0.0134, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.009589041095890411, | |
| "grad_norm": 0.06228714436292648, | |
| "learning_rate": 0.001, | |
| "loss": 0.0134, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.009817351598173516, | |
| "grad_norm": 0.08107709139585495, | |
| "learning_rate": 0.001, | |
| "loss": 0.0119, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.01004566210045662, | |
| "grad_norm": 0.08609241992235184, | |
| "learning_rate": 0.001, | |
| "loss": 0.0117, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.010273972602739725, | |
| "grad_norm": 0.08933087438344955, | |
| "learning_rate": 0.001, | |
| "loss": 0.0101, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.010502283105022832, | |
| "grad_norm": 0.23321422934532166, | |
| "learning_rate": 0.001, | |
| "loss": 0.0103, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.010730593607305937, | |
| "grad_norm": 0.1518358290195465, | |
| "learning_rate": 0.001, | |
| "loss": 0.0101, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.010958904109589041, | |
| "grad_norm": 0.15060600638389587, | |
| "learning_rate": 0.001, | |
| "loss": 0.009, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.011187214611872146, | |
| "grad_norm": 0.2696841359138489, | |
| "learning_rate": 0.001, | |
| "loss": 0.0087, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.01141552511415525, | |
| "grad_norm": 0.08441965281963348, | |
| "learning_rate": 0.001, | |
| "loss": 0.009, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.011643835616438357, | |
| "grad_norm": 0.1832842081785202, | |
| "learning_rate": 0.001, | |
| "loss": 0.0199, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.011872146118721462, | |
| "grad_norm": 0.21883782744407654, | |
| "learning_rate": 0.001, | |
| "loss": 0.0158, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.012100456621004566, | |
| "grad_norm": 12.722305297851562, | |
| "learning_rate": 0.001, | |
| "loss": 0.0915, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.012328767123287671, | |
| "grad_norm": 0.2270480841398239, | |
| "learning_rate": 0.001, | |
| "loss": 0.0383, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.012557077625570776, | |
| "grad_norm": 1.0806418657302856, | |
| "learning_rate": 0.001, | |
| "loss": 0.072, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.012785388127853882, | |
| "grad_norm": 0.42152509093284607, | |
| "learning_rate": 0.001, | |
| "loss": 0.051, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.013013698630136987, | |
| "grad_norm": 0.19152699410915375, | |
| "learning_rate": 0.001, | |
| "loss": 0.0437, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.013242009132420091, | |
| "grad_norm": 0.15559057891368866, | |
| "learning_rate": 0.001, | |
| "loss": 0.0407, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.013470319634703196, | |
| "grad_norm": 0.18103821575641632, | |
| "learning_rate": 0.001, | |
| "loss": 0.0378, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.0136986301369863, | |
| "grad_norm": 0.2188289612531662, | |
| "learning_rate": 0.001, | |
| "loss": 0.0382, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.013926940639269407, | |
| "grad_norm": 0.22403009235858917, | |
| "learning_rate": 0.001, | |
| "loss": 0.0368, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.014155251141552512, | |
| "grad_norm": 0.23726648092269897, | |
| "learning_rate": 0.001, | |
| "loss": 0.0308, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.014383561643835616, | |
| "grad_norm": 0.5590624809265137, | |
| "learning_rate": 0.001, | |
| "loss": 0.0265, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.014611872146118721, | |
| "grad_norm": 0.20665256679058075, | |
| "learning_rate": 0.001, | |
| "loss": 0.0249, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.014840182648401826, | |
| "grad_norm": 0.2618805170059204, | |
| "learning_rate": 0.001, | |
| "loss": 0.0241, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.015068493150684932, | |
| "grad_norm": 0.2558732330799103, | |
| "learning_rate": 0.001, | |
| "loss": 0.0222, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.015296803652968037, | |
| "grad_norm": 0.24830466508865356, | |
| "learning_rate": 0.001, | |
| "loss": 0.0234, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.015525114155251141, | |
| "grad_norm": 0.658237874507904, | |
| "learning_rate": 0.001, | |
| "loss": 0.0205, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.015753424657534248, | |
| "grad_norm": 0.264330118894577, | |
| "learning_rate": 0.001, | |
| "loss": 0.0225, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.01598173515981735, | |
| "grad_norm": 0.2591581642627716, | |
| "learning_rate": 0.001, | |
| "loss": 0.0243, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.016210045662100457, | |
| "grad_norm": 0.20444399118423462, | |
| "learning_rate": 0.001, | |
| "loss": 0.0209, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.01643835616438356, | |
| "grad_norm": 0.15570659935474396, | |
| "learning_rate": 0.001, | |
| "loss": 0.0176, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.016666666666666666, | |
| "grad_norm": 0.9550731778144836, | |
| "learning_rate": 0.001, | |
| "loss": 0.0209, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.016894977168949773, | |
| "grad_norm": 0.17412568628787994, | |
| "learning_rate": 0.001, | |
| "loss": 0.0173, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.017123287671232876, | |
| "grad_norm": 0.17629070580005646, | |
| "learning_rate": 0.001, | |
| "loss": 0.0188, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.017351598173515982, | |
| "grad_norm": 0.1633068323135376, | |
| "learning_rate": 0.001, | |
| "loss": 0.02, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.017579908675799085, | |
| "grad_norm": 0.15935851633548737, | |
| "learning_rate": 0.001, | |
| "loss": 0.0168, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.01780821917808219, | |
| "grad_norm": 0.12234501540660858, | |
| "learning_rate": 0.001, | |
| "loss": 0.0154, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.018036529680365298, | |
| "grad_norm": 0.21797019243240356, | |
| "learning_rate": 0.001, | |
| "loss": 0.0134, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.0182648401826484, | |
| "grad_norm": 0.14621035754680634, | |
| "learning_rate": 0.001, | |
| "loss": 0.013, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.018493150684931507, | |
| "grad_norm": 0.07467932254076004, | |
| "learning_rate": 0.001, | |
| "loss": 0.0118, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.01872146118721461, | |
| "grad_norm": 0.07849911600351334, | |
| "learning_rate": 0.001, | |
| "loss": 0.0109, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.018949771689497717, | |
| "grad_norm": 0.12953932583332062, | |
| "learning_rate": 0.001, | |
| "loss": 0.0108, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.019178082191780823, | |
| "grad_norm": 0.07049839198589325, | |
| "learning_rate": 0.001, | |
| "loss": 0.0101, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.019406392694063926, | |
| "grad_norm": 0.06369508057832718, | |
| "learning_rate": 0.001, | |
| "loss": 0.0095, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.019634703196347032, | |
| "grad_norm": 0.057968154549598694, | |
| "learning_rate": 0.001, | |
| "loss": 0.0095, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.01986301369863014, | |
| "grad_norm": 0.07910202443599701, | |
| "learning_rate": 0.001, | |
| "loss": 0.0097, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.02009132420091324, | |
| "grad_norm": 0.049049049615859985, | |
| "learning_rate": 0.001, | |
| "loss": 0.009, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.020319634703196348, | |
| "grad_norm": 0.041860196739435196, | |
| "learning_rate": 0.001, | |
| "loss": 0.008, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.02054794520547945, | |
| "grad_norm": 0.056602053344249725, | |
| "learning_rate": 0.001, | |
| "loss": 0.0093, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.020776255707762557, | |
| "grad_norm": 0.08956869691610336, | |
| "learning_rate": 0.001, | |
| "loss": 0.0107, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.021004566210045664, | |
| "grad_norm": 0.033224668353796005, | |
| "learning_rate": 0.001, | |
| "loss": 0.007, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.021232876712328767, | |
| "grad_norm": 0.047221846878528595, | |
| "learning_rate": 0.001, | |
| "loss": 0.0065, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.021461187214611873, | |
| "grad_norm": 0.05241613835096359, | |
| "learning_rate": 0.001, | |
| "loss": 0.0073, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.021689497716894976, | |
| "grad_norm": 0.05120820179581642, | |
| "learning_rate": 0.001, | |
| "loss": 0.0075, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.021917808219178082, | |
| "grad_norm": 0.042824823409318924, | |
| "learning_rate": 0.001, | |
| "loss": 0.0072, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.02214611872146119, | |
| "grad_norm": 0.037190262228250504, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.02237442922374429, | |
| "grad_norm": 0.03563378378748894, | |
| "learning_rate": 0.001, | |
| "loss": 0.007, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.022602739726027398, | |
| "grad_norm": 0.03606602922081947, | |
| "learning_rate": 0.001, | |
| "loss": 0.0059, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.0228310502283105, | |
| "grad_norm": 0.03840276971459389, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.023059360730593607, | |
| "grad_norm": 0.12917055189609528, | |
| "learning_rate": 0.001, | |
| "loss": 0.0103, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.023287671232876714, | |
| "grad_norm": 0.09650158882141113, | |
| "learning_rate": 0.001, | |
| "loss": 0.0092, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.023515981735159817, | |
| "grad_norm": 0.7314733862876892, | |
| "learning_rate": 0.001, | |
| "loss": 0.0113, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.023744292237442923, | |
| "grad_norm": 0.1915358155965805, | |
| "learning_rate": 0.001, | |
| "loss": 0.0094, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.023972602739726026, | |
| "grad_norm": 0.23454691469669342, | |
| "learning_rate": 0.001, | |
| "loss": 0.014, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.024200913242009132, | |
| "grad_norm": 0.1961510330438614, | |
| "learning_rate": 0.001, | |
| "loss": 0.0132, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.02442922374429224, | |
| "grad_norm": 0.12320326268672943, | |
| "learning_rate": 0.001, | |
| "loss": 0.0097, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.024657534246575342, | |
| "grad_norm": 0.05942022427916527, | |
| "learning_rate": 0.001, | |
| "loss": 0.0093, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.024885844748858448, | |
| "grad_norm": 0.04996173083782196, | |
| "learning_rate": 0.001, | |
| "loss": 0.008, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.02511415525114155, | |
| "grad_norm": 0.048785947263240814, | |
| "learning_rate": 0.001, | |
| "loss": 0.0087, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.025342465753424658, | |
| "grad_norm": 0.15529130399227142, | |
| "learning_rate": 0.001, | |
| "loss": 0.0094, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.025570776255707764, | |
| "grad_norm": 0.06682206690311432, | |
| "learning_rate": 0.001, | |
| "loss": 0.0087, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.025799086757990867, | |
| "grad_norm": 0.07254649698734283, | |
| "learning_rate": 0.001, | |
| "loss": 0.0098, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.026027397260273973, | |
| "grad_norm": 0.03909542039036751, | |
| "learning_rate": 0.001, | |
| "loss": 0.0065, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.026255707762557076, | |
| "grad_norm": 0.03716771677136421, | |
| "learning_rate": 0.001, | |
| "loss": 0.0086, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.026484018264840183, | |
| "grad_norm": 0.04341251775622368, | |
| "learning_rate": 0.001, | |
| "loss": 0.007, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.02671232876712329, | |
| "grad_norm": 0.0455278642475605, | |
| "learning_rate": 0.001, | |
| "loss": 0.0069, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.026940639269406392, | |
| "grad_norm": 0.0869159922003746, | |
| "learning_rate": 0.001, | |
| "loss": 0.0086, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.0271689497716895, | |
| "grad_norm": 0.05491505563259125, | |
| "learning_rate": 0.001, | |
| "loss": 0.0068, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.0273972602739726, | |
| "grad_norm": 0.05067432299256325, | |
| "learning_rate": 0.001, | |
| "loss": 0.0062, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.027625570776255708, | |
| "grad_norm": 0.06873013079166412, | |
| "learning_rate": 0.001, | |
| "loss": 0.008, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.027853881278538814, | |
| "grad_norm": 0.03151897341012955, | |
| "learning_rate": 0.001, | |
| "loss": 0.0055, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.028082191780821917, | |
| "grad_norm": 0.027348244562745094, | |
| "learning_rate": 0.001, | |
| "loss": 0.006, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.028310502283105023, | |
| "grad_norm": 0.04307318106293678, | |
| "learning_rate": 0.001, | |
| "loss": 0.0064, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.028538812785388126, | |
| "grad_norm": 0.0409172885119915, | |
| "learning_rate": 0.001, | |
| "loss": 0.0065, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.028767123287671233, | |
| "grad_norm": 0.042198970913887024, | |
| "learning_rate": 0.001, | |
| "loss": 0.0072, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.02899543378995434, | |
| "grad_norm": 0.046845417469739914, | |
| "learning_rate": 0.001, | |
| "loss": 0.0067, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.029223744292237442, | |
| "grad_norm": 0.03862365707755089, | |
| "learning_rate": 0.001, | |
| "loss": 0.0067, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.02945205479452055, | |
| "grad_norm": 0.04204321652650833, | |
| "learning_rate": 0.001, | |
| "loss": 0.0074, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.02968036529680365, | |
| "grad_norm": 0.03613033518195152, | |
| "learning_rate": 0.001, | |
| "loss": 0.0067, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.029908675799086758, | |
| "grad_norm": 0.03899417817592621, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.030136986301369864, | |
| "grad_norm": 0.03047838620841503, | |
| "learning_rate": 0.001, | |
| "loss": 0.0059, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.030365296803652967, | |
| "grad_norm": 0.04626467451453209, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.030593607305936073, | |
| "grad_norm": 0.04004530981183052, | |
| "learning_rate": 0.001, | |
| "loss": 0.0077, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.030821917808219176, | |
| "grad_norm": 0.03990226984024048, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.031050228310502283, | |
| "grad_norm": 0.035800885409116745, | |
| "learning_rate": 0.001, | |
| "loss": 0.0042, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.03127853881278539, | |
| "grad_norm": 0.03377184644341469, | |
| "learning_rate": 0.001, | |
| "loss": 0.0044, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.031506849315068496, | |
| "grad_norm": 0.031017042696475983, | |
| "learning_rate": 0.001, | |
| "loss": 0.0044, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.031735159817351595, | |
| "grad_norm": 0.027331147342920303, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.0319634703196347, | |
| "grad_norm": 0.034048646688461304, | |
| "learning_rate": 0.001, | |
| "loss": 0.0042, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.03219178082191781, | |
| "grad_norm": 0.03277864679694176, | |
| "learning_rate": 0.001, | |
| "loss": 0.0053, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.032420091324200914, | |
| "grad_norm": 0.04241342470049858, | |
| "learning_rate": 0.001, | |
| "loss": 0.0041, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.03264840182648402, | |
| "grad_norm": 0.026137417182326317, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.03287671232876712, | |
| "grad_norm": 0.03562963008880615, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.033105022831050226, | |
| "grad_norm": 0.026813900098204613, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.03333333333333333, | |
| "grad_norm": 0.030897343531250954, | |
| "learning_rate": 0.001, | |
| "loss": 0.0044, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.03356164383561644, | |
| "grad_norm": 0.02891898714005947, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.033789954337899546, | |
| "grad_norm": 0.03819667547941208, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.034018264840182645, | |
| "grad_norm": 0.02293401025235653, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.03424657534246575, | |
| "grad_norm": 0.02600831165909767, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.03447488584474886, | |
| "grad_norm": 0.040420051664114, | |
| "learning_rate": 0.001, | |
| "loss": 0.0051, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.034703196347031964, | |
| "grad_norm": 0.03907687962055206, | |
| "learning_rate": 0.001, | |
| "loss": 0.0047, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.03493150684931507, | |
| "grad_norm": 0.03037801943719387, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.03515981735159817, | |
| "grad_norm": 0.05104570835828781, | |
| "learning_rate": 0.001, | |
| "loss": 0.0061, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.03538812785388128, | |
| "grad_norm": 0.033460833132267, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.03561643835616438, | |
| "grad_norm": 0.034624133259058, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.03584474885844749, | |
| "grad_norm": 0.03363336622714996, | |
| "learning_rate": 0.001, | |
| "loss": 0.0046, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.036073059360730596, | |
| "grad_norm": 0.03651309013366699, | |
| "learning_rate": 0.001, | |
| "loss": 0.0042, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.036301369863013695, | |
| "grad_norm": 0.031121717765927315, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.0365296803652968, | |
| "grad_norm": 0.03925270959734917, | |
| "learning_rate": 0.001, | |
| "loss": 0.0045, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.03675799086757991, | |
| "grad_norm": 0.02922016754746437, | |
| "learning_rate": 0.001, | |
| "loss": 0.004, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.036986301369863014, | |
| "grad_norm": 0.03618766367435455, | |
| "learning_rate": 0.001, | |
| "loss": 0.004, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.03721461187214612, | |
| "grad_norm": 0.05399168282747269, | |
| "learning_rate": 0.001, | |
| "loss": 0.0049, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.03744292237442922, | |
| "grad_norm": 0.047811247408390045, | |
| "learning_rate": 0.001, | |
| "loss": 0.0045, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.03767123287671233, | |
| "grad_norm": 0.041643090546131134, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.03789954337899543, | |
| "grad_norm": 0.03867914155125618, | |
| "learning_rate": 0.001, | |
| "loss": 0.0045, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.03812785388127854, | |
| "grad_norm": 0.0361204594373703, | |
| "learning_rate": 0.001, | |
| "loss": 0.0041, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.038356164383561646, | |
| "grad_norm": 0.036205410957336426, | |
| "learning_rate": 0.001, | |
| "loss": 0.0048, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.03858447488584475, | |
| "grad_norm": 0.03310992196202278, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.03881278538812785, | |
| "grad_norm": 0.027686715126037598, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.03904109589041096, | |
| "grad_norm": 0.029249897226691246, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.039269406392694065, | |
| "grad_norm": 0.03591005504131317, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.03949771689497717, | |
| "grad_norm": 0.030710754916071892, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.03972602739726028, | |
| "grad_norm": 0.03295068442821503, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.03995433789954338, | |
| "grad_norm": 0.02918722666800022, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.04018264840182648, | |
| "grad_norm": 0.035701602697372437, | |
| "learning_rate": 0.001, | |
| "loss": 0.004, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.04041095890410959, | |
| "grad_norm": 0.03620489314198494, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.040639269406392696, | |
| "grad_norm": 0.04025309905409813, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.0408675799086758, | |
| "grad_norm": 0.03256874904036522, | |
| "learning_rate": 0.001, | |
| "loss": 0.0045, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.0410958904109589, | |
| "grad_norm": 0.03545399010181427, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.04132420091324201, | |
| "grad_norm": 0.04845140874385834, | |
| "learning_rate": 0.001, | |
| "loss": 0.0047, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.041552511415525115, | |
| "grad_norm": 0.045855190604925156, | |
| "learning_rate": 0.001, | |
| "loss": 0.0041, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.04178082191780822, | |
| "grad_norm": 0.026962406933307648, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.04200913242009133, | |
| "grad_norm": 0.028487997129559517, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.04223744292237443, | |
| "grad_norm": 0.038144659250974655, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.04246575342465753, | |
| "grad_norm": 0.0443580225110054, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.04269406392694064, | |
| "grad_norm": 0.035410862416028976, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.042922374429223746, | |
| "grad_norm": 0.0394715741276741, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.04315068493150685, | |
| "grad_norm": 0.03207629173994064, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.04337899543378995, | |
| "grad_norm": 0.02908760868012905, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.04360730593607306, | |
| "grad_norm": 0.0272049680352211, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.043835616438356165, | |
| "grad_norm": 0.02282743901014328, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.04406392694063927, | |
| "grad_norm": 0.024958152323961258, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.04429223744292238, | |
| "grad_norm": 0.029786400496959686, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.04452054794520548, | |
| "grad_norm": 0.023932697251439095, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.04474885844748858, | |
| "grad_norm": 0.02262377366423607, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.04497716894977169, | |
| "grad_norm": 0.033051978796720505, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.045205479452054796, | |
| "grad_norm": 0.029031749814748764, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.0454337899543379, | |
| "grad_norm": 0.030305176973342896, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.045662100456621, | |
| "grad_norm": 0.02067619003355503, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.04589041095890411, | |
| "grad_norm": 0.03859075903892517, | |
| "learning_rate": 0.001, | |
| "loss": 0.0041, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.046118721461187215, | |
| "grad_norm": 0.03318578004837036, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.04634703196347032, | |
| "grad_norm": 0.03525965288281441, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.04657534246575343, | |
| "grad_norm": 0.041064050048589706, | |
| "learning_rate": 0.001, | |
| "loss": 0.0045, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.04680365296803653, | |
| "grad_norm": 0.045857496559619904, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.047031963470319633, | |
| "grad_norm": 0.037904538214206696, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.04726027397260274, | |
| "grad_norm": 0.03063504584133625, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.047488584474885846, | |
| "grad_norm": 0.040485553443431854, | |
| "learning_rate": 0.001, | |
| "loss": 0.004, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.04771689497716895, | |
| "grad_norm": 0.034435346722602844, | |
| "learning_rate": 0.001, | |
| "loss": 0.0046, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.04794520547945205, | |
| "grad_norm": 0.027862414717674255, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.04817351598173516, | |
| "grad_norm": 0.034713245928287506, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.048401826484018265, | |
| "grad_norm": 0.038781870156526566, | |
| "learning_rate": 0.001, | |
| "loss": 0.0046, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.04863013698630137, | |
| "grad_norm": 0.025890646502375603, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.04885844748858448, | |
| "grad_norm": 0.0285344235599041, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.04908675799086758, | |
| "grad_norm": 0.032012905925512314, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.049315068493150684, | |
| "grad_norm": 0.04779508709907532, | |
| "learning_rate": 0.001, | |
| "loss": 0.0037, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.04954337899543379, | |
| "grad_norm": 0.039367783814668655, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.049771689497716896, | |
| "grad_norm": 0.02745324745774269, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 0.03268812596797943, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.0502283105022831, | |
| "grad_norm": 0.023665225133299828, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.05045662100456621, | |
| "grad_norm": 0.0373012013733387, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.050684931506849315, | |
| "grad_norm": 0.033793918788433075, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.05091324200913242, | |
| "grad_norm": 0.0297444686293602, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.05114155251141553, | |
| "grad_norm": 0.05024491623044014, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.05136986301369863, | |
| "grad_norm": 0.03143681213259697, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.051598173515981734, | |
| "grad_norm": 0.023645315319299698, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.05182648401826484, | |
| "grad_norm": 0.02782478556036949, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.052054794520547946, | |
| "grad_norm": 0.0307586882263422, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.05228310502283105, | |
| "grad_norm": 0.04269454628229141, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.05251141552511415, | |
| "grad_norm": 0.035806287080049515, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.05273972602739726, | |
| "grad_norm": 0.03528301417827606, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.052968036529680365, | |
| "grad_norm": 0.029358338564634323, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.05319634703196347, | |
| "grad_norm": 0.021077649667859077, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.05342465753424658, | |
| "grad_norm": 0.029840657487511635, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.05365296803652968, | |
| "grad_norm": 0.028463926166296005, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.053881278538812784, | |
| "grad_norm": 0.026239361613988876, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.05410958904109589, | |
| "grad_norm": 0.02149251475930214, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.054337899543379, | |
| "grad_norm": 0.02750280313193798, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.0545662100456621, | |
| "grad_norm": 0.028853842988610268, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.0547945205479452, | |
| "grad_norm": 0.03062448836863041, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.05502283105022831, | |
| "grad_norm": 0.021715497598052025, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.055251141552511415, | |
| "grad_norm": 0.03351881727576256, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.05547945205479452, | |
| "grad_norm": 0.025600440800189972, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.05570776255707763, | |
| "grad_norm": 0.03094620630145073, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.05593607305936073, | |
| "grad_norm": 0.03529248386621475, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.056164383561643834, | |
| "grad_norm": 0.026421545073390007, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.05639269406392694, | |
| "grad_norm": 0.018347790464758873, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.05662100456621005, | |
| "grad_norm": 0.02605101279914379, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.05684931506849315, | |
| "grad_norm": 0.027538320049643517, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.05707762557077625, | |
| "grad_norm": 0.030089175328612328, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.05730593607305936, | |
| "grad_norm": 0.02568584680557251, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.057534246575342465, | |
| "grad_norm": 0.043693918734788895, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.05776255707762557, | |
| "grad_norm": 0.025515882298350334, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.05799086757990868, | |
| "grad_norm": 0.023086579516530037, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.05821917808219178, | |
| "grad_norm": 0.03552839159965515, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.058447488584474884, | |
| "grad_norm": 0.030602211132645607, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.05867579908675799, | |
| "grad_norm": 0.02757362276315689, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.0589041095890411, | |
| "grad_norm": 0.04006500914692879, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.0591324200913242, | |
| "grad_norm": 0.039859503507614136, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.0593607305936073, | |
| "grad_norm": 0.02268202416598797, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.05958904109589041, | |
| "grad_norm": 0.020849550142884254, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.059817351598173515, | |
| "grad_norm": 0.026384403929114342, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.06004566210045662, | |
| "grad_norm": 0.029226887971162796, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.06027397260273973, | |
| "grad_norm": 0.029352016746997833, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.06050228310502283, | |
| "grad_norm": 0.023828251287341118, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.060730593607305934, | |
| "grad_norm": 0.050515275448560715, | |
| "learning_rate": 0.001, | |
| "loss": 0.0065, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.06095890410958904, | |
| "grad_norm": 0.3609565198421478, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.06118721461187215, | |
| "grad_norm": 0.030405467376112938, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.06141552511415525, | |
| "grad_norm": 0.07481672614812851, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.06164383561643835, | |
| "grad_norm": 0.09166887402534485, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.06187214611872146, | |
| "grad_norm": 0.06070258840918541, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.062100456621004566, | |
| "grad_norm": 0.02546994574368, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.06232876712328767, | |
| "grad_norm": 0.028366973623633385, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.06255707762557078, | |
| "grad_norm": 0.02752639539539814, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.06278538812785388, | |
| "grad_norm": 0.02514069154858589, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.06301369863013699, | |
| "grad_norm": 0.03297794982790947, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.06324200913242009, | |
| "grad_norm": 0.03322751075029373, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.06347031963470319, | |
| "grad_norm": 0.028292890638113022, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.0636986301369863, | |
| "grad_norm": 0.04020245000720024, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.0639269406392694, | |
| "grad_norm": 0.03231251239776611, | |
| "learning_rate": 0.001, | |
| "loss": 0.0039, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.06415525114155252, | |
| "grad_norm": 0.0225644800812006, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.06438356164383562, | |
| "grad_norm": 0.028778597712516785, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.06461187214611872, | |
| "grad_norm": 0.02618185058236122, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.06484018264840183, | |
| "grad_norm": 0.03890310227870941, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.06506849315068493, | |
| "grad_norm": 0.029423601925373077, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.06529680365296804, | |
| "grad_norm": 0.04089478775858879, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.06552511415525114, | |
| "grad_norm": 0.031911611557006836, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.06575342465753424, | |
| "grad_norm": 0.02856455370783806, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.06598173515981735, | |
| "grad_norm": 0.02316523902118206, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.06621004566210045, | |
| "grad_norm": 0.021586967632174492, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.06643835616438357, | |
| "grad_norm": 0.020875398069620132, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.06666666666666667, | |
| "grad_norm": 0.025591716170310974, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.06689497716894977, | |
| "grad_norm": 0.02905621938407421, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.06712328767123288, | |
| "grad_norm": 0.03460671007633209, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.06735159817351598, | |
| "grad_norm": 0.014558055438101292, | |
| "learning_rate": 0.001, | |
| "loss": 0.0008, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.06757990867579909, | |
| "grad_norm": 0.021651627495884895, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.06780821917808219, | |
| "grad_norm": 0.020275374874472618, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.06803652968036529, | |
| "grad_norm": 0.030108539387583733, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.0682648401826484, | |
| "grad_norm": 0.02870999090373516, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.0684931506849315, | |
| "grad_norm": 0.030189916491508484, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.06872146118721462, | |
| "grad_norm": 0.048917006701231, | |
| "learning_rate": 0.001, | |
| "loss": 0.0051, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.06894977168949772, | |
| "grad_norm": 0.0351158082485199, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.06917808219178082, | |
| "grad_norm": 0.0304318368434906, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.06940639269406393, | |
| "grad_norm": 0.02364553138613701, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.06963470319634703, | |
| "grad_norm": 0.025430144742131233, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.06986301369863014, | |
| "grad_norm": 0.028122954070568085, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.07009132420091324, | |
| "grad_norm": 0.04655618220567703, | |
| "learning_rate": 0.001, | |
| "loss": 0.0038, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.07031963470319634, | |
| "grad_norm": 0.03192426636815071, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.07054794520547945, | |
| "grad_norm": 0.03930205851793289, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.07077625570776255, | |
| "grad_norm": 0.0391114316880703, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.07100456621004567, | |
| "grad_norm": 0.02882283739745617, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.07123287671232877, | |
| "grad_norm": 0.025312229990959167, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.07146118721461187, | |
| "grad_norm": 0.03631848841905594, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.07168949771689498, | |
| "grad_norm": 0.02449788525700569, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.07191780821917808, | |
| "grad_norm": 0.0258337315171957, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.07214611872146119, | |
| "grad_norm": 0.023845955729484558, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.07237442922374429, | |
| "grad_norm": 0.024546071887016296, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.07260273972602739, | |
| "grad_norm": 0.0188372153788805, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.0728310502283105, | |
| "grad_norm": 0.03890606015920639, | |
| "learning_rate": 0.001, | |
| "loss": 0.0047, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.0730593607305936, | |
| "grad_norm": 0.02590329386293888, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.07328767123287672, | |
| "grad_norm": 0.036657921969890594, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.07351598173515982, | |
| "grad_norm": 0.04023008793592453, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.07374429223744292, | |
| "grad_norm": 0.025426125153899193, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.07397260273972603, | |
| "grad_norm": 0.02883792109787464, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.07420091324200913, | |
| "grad_norm": 0.02551659569144249, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.07442922374429224, | |
| "grad_norm": 0.023540591821074486, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.07465753424657534, | |
| "grad_norm": 0.02690877579152584, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.07488584474885844, | |
| "grad_norm": 0.020135624334216118, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.07511415525114155, | |
| "grad_norm": 0.026753783226013184, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.07534246575342465, | |
| "grad_norm": 0.0383230559527874, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.07557077625570777, | |
| "grad_norm": 0.03493601456284523, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.07579908675799087, | |
| "grad_norm": 0.02847091108560562, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.07602739726027398, | |
| "grad_norm": 0.023921307176351547, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.07625570776255708, | |
| "grad_norm": 0.03113155998289585, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.07648401826484018, | |
| "grad_norm": 0.024777159094810486, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.07671232876712329, | |
| "grad_norm": 0.02515614964067936, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.07694063926940639, | |
| "grad_norm": 0.023284632712602615, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.0771689497716895, | |
| "grad_norm": 0.023549994453787804, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.0773972602739726, | |
| "grad_norm": 0.026529377326369286, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.0776255707762557, | |
| "grad_norm": 0.02118872106075287, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.07785388127853882, | |
| "grad_norm": 0.0226143728941679, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.07808219178082192, | |
| "grad_norm": 0.022813035175204277, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.07831050228310503, | |
| "grad_norm": 0.019757017493247986, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.07853881278538813, | |
| "grad_norm": 0.02227397821843624, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.07876712328767123, | |
| "grad_norm": 0.022303001955151558, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.07899543378995434, | |
| "grad_norm": 0.025369267910718918, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.07922374429223744, | |
| "grad_norm": 0.022909611463546753, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.07945205479452055, | |
| "grad_norm": 0.02747984044253826, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.07968036529680365, | |
| "grad_norm": 0.028999097645282745, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.07990867579908675, | |
| "grad_norm": 0.013709438033401966, | |
| "learning_rate": 0.001, | |
| "loss": 0.001, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.08013698630136987, | |
| "grad_norm": 0.03311995416879654, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.08036529680365297, | |
| "grad_norm": 0.030428579077124596, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.08059360730593608, | |
| "grad_norm": 0.02569733001291752, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.08082191780821918, | |
| "grad_norm": 0.03375837951898575, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.08105022831050228, | |
| "grad_norm": 0.02408471703529358, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.08127853881278539, | |
| "grad_norm": 0.025053909048438072, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.08150684931506849, | |
| "grad_norm": 0.03166033327579498, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.0817351598173516, | |
| "grad_norm": 0.023597661405801773, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.0819634703196347, | |
| "grad_norm": 0.02543063834309578, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.0821917808219178, | |
| "grad_norm": 0.024594414979219437, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.08242009132420092, | |
| "grad_norm": 0.026880159974098206, | |
| "learning_rate": 0.001, | |
| "loss": 0.0032, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.08264840182648402, | |
| "grad_norm": 0.0315290130674839, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.08287671232876713, | |
| "grad_norm": 0.027256738394498825, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.08310502283105023, | |
| "grad_norm": 0.022752612829208374, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.08333333333333333, | |
| "grad_norm": 0.013999447226524353, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.08356164383561644, | |
| "grad_norm": 0.026544874534010887, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.08378995433789954, | |
| "grad_norm": 0.018856002017855644, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.08401826484018265, | |
| "grad_norm": 0.04184157773852348, | |
| "learning_rate": 0.001, | |
| "loss": 0.0043, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.08424657534246575, | |
| "grad_norm": 0.027606133371591568, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.08447488584474885, | |
| "grad_norm": 0.0274574626237154, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.08470319634703197, | |
| "grad_norm": 0.029858067631721497, | |
| "learning_rate": 0.001, | |
| "loss": 0.0033, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.08493150684931507, | |
| "grad_norm": 0.026789812371134758, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.08515981735159818, | |
| "grad_norm": 0.029502468183636665, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.08538812785388128, | |
| "grad_norm": 0.025616176426410675, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.08561643835616438, | |
| "grad_norm": 0.016593433916568756, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.08584474885844749, | |
| "grad_norm": 0.026096921414136887, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.08607305936073059, | |
| "grad_norm": 0.034800466150045395, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.0863013698630137, | |
| "grad_norm": 0.025603458285331726, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.0865296803652968, | |
| "grad_norm": 0.01851038821041584, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.0867579908675799, | |
| "grad_norm": 0.028083520010113716, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.08698630136986302, | |
| "grad_norm": 0.022135423496365547, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.08721461187214612, | |
| "grad_norm": 0.02563360147178173, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.08744292237442923, | |
| "grad_norm": 0.03189925476908684, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.08767123287671233, | |
| "grad_norm": 0.026175467297434807, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.08789954337899543, | |
| "grad_norm": 0.019512465223670006, | |
| "learning_rate": 0.001, | |
| "loss": 0.002, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.08812785388127854, | |
| "grad_norm": 0.013086398132145405, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.08835616438356164, | |
| "grad_norm": 0.018814057111740112, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.08858447488584476, | |
| "grad_norm": 0.018231388181447983, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.08881278538812785, | |
| "grad_norm": 0.0169826727360487, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.08904109589041095, | |
| "grad_norm": 0.03351948410272598, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.08926940639269407, | |
| "grad_norm": 0.023230386897921562, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.08949771689497717, | |
| "grad_norm": 0.02241365611553192, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.08972602739726028, | |
| "grad_norm": 0.021022368222475052, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.08995433789954338, | |
| "grad_norm": 0.022241264581680298, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.09018264840182648, | |
| "grad_norm": 0.02163674309849739, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.09041095890410959, | |
| "grad_norm": 0.020653806626796722, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.09063926940639269, | |
| "grad_norm": 0.020344195887446404, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.0908675799086758, | |
| "grad_norm": 0.015474921092391014, | |
| "learning_rate": 0.001, | |
| "loss": 0.001, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.0910958904109589, | |
| "grad_norm": 0.017434895038604736, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.091324200913242, | |
| "grad_norm": 0.02458396926522255, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.09155251141552512, | |
| "grad_norm": 0.03149225190281868, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.09178082191780822, | |
| "grad_norm": 0.026796750724315643, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.09200913242009133, | |
| "grad_norm": 0.020359905436635017, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.09223744292237443, | |
| "grad_norm": 0.024055240675807, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.09246575342465753, | |
| "grad_norm": 0.026445262134075165, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.09269406392694064, | |
| "grad_norm": 0.02413698472082615, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.09292237442922374, | |
| "grad_norm": 0.024934392422437668, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.09315068493150686, | |
| "grad_norm": 0.024041904136538506, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.09337899543378995, | |
| "grad_norm": 0.029535695910453796, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.09360730593607305, | |
| "grad_norm": 0.022993121296167374, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.09383561643835617, | |
| "grad_norm": 0.018401680514216423, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.09406392694063927, | |
| "grad_norm": 0.018391454592347145, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.09429223744292238, | |
| "grad_norm": 0.03675055503845215, | |
| "learning_rate": 0.001, | |
| "loss": 0.004, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.09452054794520548, | |
| "grad_norm": 0.026887210085988045, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.09474885844748858, | |
| "grad_norm": 0.02171693742275238, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.09497716894977169, | |
| "grad_norm": 0.036046102643013, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.09520547945205479, | |
| "grad_norm": 0.02878933772444725, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.0954337899543379, | |
| "grad_norm": 0.017262322828173637, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.095662100456621, | |
| "grad_norm": 0.028725091367959976, | |
| "learning_rate": 0.001, | |
| "loss": 0.0029, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.0958904109589041, | |
| "grad_norm": 0.03320247679948807, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.09611872146118722, | |
| "grad_norm": 0.025160877034068108, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.09634703196347032, | |
| "grad_norm": 0.023186132311820984, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.09657534246575343, | |
| "grad_norm": 0.03161732107400894, | |
| "learning_rate": 0.001, | |
| "loss": 0.0036, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.09680365296803653, | |
| "grad_norm": 0.023892000317573547, | |
| "learning_rate": 0.001, | |
| "loss": 0.003, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.09703196347031963, | |
| "grad_norm": 0.04748233035206795, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.09726027397260274, | |
| "grad_norm": 0.018185172230005264, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.09748858447488584, | |
| "grad_norm": 0.024023696780204773, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.09771689497716896, | |
| "grad_norm": 0.019455142319202423, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.09794520547945205, | |
| "grad_norm": 0.02732614241540432, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.09817351598173515, | |
| "grad_norm": 0.017890289425849915, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.09840182648401827, | |
| "grad_norm": 0.028596822172403336, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.09863013698630137, | |
| "grad_norm": 0.03205295652151108, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.09885844748858448, | |
| "grad_norm": 0.03697388991713524, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.09908675799086758, | |
| "grad_norm": 0.03635745123028755, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.09931506849315068, | |
| "grad_norm": 0.023816758766770363, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.09954337899543379, | |
| "grad_norm": 0.019579321146011353, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.09977168949771689, | |
| "grad_norm": 0.023318948224186897, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.022768663242459297, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.1002283105022831, | |
| "grad_norm": 0.015700766816735268, | |
| "learning_rate": 0.001, | |
| "loss": 0.0014, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.1004566210045662, | |
| "grad_norm": 0.01778263971209526, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.10068493150684932, | |
| "grad_norm": 0.028968170285224915, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.10091324200913242, | |
| "grad_norm": 0.01981866918504238, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.10114155251141553, | |
| "grad_norm": 0.022714197635650635, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.10136986301369863, | |
| "grad_norm": 0.024588901549577713, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.10159817351598173, | |
| "grad_norm": 0.02210937812924385, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.10182648401826484, | |
| "grad_norm": 0.015890007838606834, | |
| "learning_rate": 0.001, | |
| "loss": 0.001, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.10205479452054794, | |
| "grad_norm": 0.02576160989701748, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.10228310502283106, | |
| "grad_norm": 0.025480084121227264, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.10251141552511416, | |
| "grad_norm": 0.020510738715529442, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.10273972602739725, | |
| "grad_norm": 0.026737291365861893, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.10296803652968037, | |
| "grad_norm": 0.03111446276307106, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.10319634703196347, | |
| "grad_norm": 0.029617153108119965, | |
| "learning_rate": 0.001, | |
| "loss": 0.0028, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.10342465753424658, | |
| "grad_norm": 0.033933065831661224, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.10365296803652968, | |
| "grad_norm": 0.029769249260425568, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.10388127853881278, | |
| "grad_norm": 0.029685623943805695, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.10410958904109589, | |
| "grad_norm": 0.03061087615787983, | |
| "learning_rate": 0.001, | |
| "loss": 0.0034, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.10433789954337899, | |
| "grad_norm": 0.02060793712735176, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.1045662100456621, | |
| "grad_norm": 0.02304467186331749, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.1047945205479452, | |
| "grad_norm": 0.0261305570602417, | |
| "learning_rate": 0.001, | |
| "loss": 0.0026, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.1050228310502283, | |
| "grad_norm": 0.023978248238563538, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.10525114155251142, | |
| "grad_norm": 0.02428649179637432, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.10547945205479452, | |
| "grad_norm": 0.0215776227414608, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.10570776255707763, | |
| "grad_norm": 0.020924601703882217, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.10593607305936073, | |
| "grad_norm": 0.020037012174725533, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.10616438356164383, | |
| "grad_norm": 0.021177353337407112, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.10639269406392694, | |
| "grad_norm": 0.021240398287773132, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.10662100456621004, | |
| "grad_norm": 0.022526200860738754, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.10684931506849316, | |
| "grad_norm": 0.02899310737848282, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.10707762557077626, | |
| "grad_norm": 0.021294210106134415, | |
| "learning_rate": 0.001, | |
| "loss": 0.0025, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.10730593607305935, | |
| "grad_norm": 0.019539158791303635, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.10753424657534247, | |
| "grad_norm": 0.03813247010111809, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.10776255707762557, | |
| "grad_norm": 0.027778642252087593, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.10799086757990868, | |
| "grad_norm": 0.023844033479690552, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.10821917808219178, | |
| "grad_norm": 0.023807501420378685, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.10844748858447488, | |
| "grad_norm": 0.023057186976075172, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.108675799086758, | |
| "grad_norm": 0.018374644219875336, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.10890410958904109, | |
| "grad_norm": 0.022881170734763145, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.1091324200913242, | |
| "grad_norm": 0.017999105155467987, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.1093607305936073, | |
| "grad_norm": 0.026413699612021446, | |
| "learning_rate": 0.001, | |
| "loss": 0.0035, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.1095890410958904, | |
| "grad_norm": 0.026815691962838173, | |
| "learning_rate": 0.001, | |
| "loss": 0.0031, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.10981735159817352, | |
| "grad_norm": 0.01882576383650303, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.11004566210045662, | |
| "grad_norm": 0.022626416757702827, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.11027397260273973, | |
| "grad_norm": 0.0262600127607584, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.11050228310502283, | |
| "grad_norm": 0.017802784219384193, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.11073059360730593, | |
| "grad_norm": 0.017433062195777893, | |
| "learning_rate": 0.001, | |
| "loss": 0.0011, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.11095890410958904, | |
| "grad_norm": 0.023387275636196136, | |
| "learning_rate": 0.001, | |
| "loss": 0.0027, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.11118721461187214, | |
| "grad_norm": 0.021118011325597763, | |
| "learning_rate": 0.001, | |
| "loss": 0.0022, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.11141552511415526, | |
| "grad_norm": 0.01577088050544262, | |
| "learning_rate": 0.001, | |
| "loss": 0.0017, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.11164383561643836, | |
| "grad_norm": 0.020268132910132408, | |
| "learning_rate": 0.001, | |
| "loss": 0.0013, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.11187214611872145, | |
| "grad_norm": 0.01911369152367115, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.11210045662100457, | |
| "grad_norm": 0.02497555874288082, | |
| "learning_rate": 0.001, | |
| "loss": 0.0024, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.11232876712328767, | |
| "grad_norm": 0.02308499813079834, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.11255707762557078, | |
| "grad_norm": 0.01704687625169754, | |
| "learning_rate": 0.001, | |
| "loss": 0.0015, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.11278538812785388, | |
| "grad_norm": 0.01520821824669838, | |
| "learning_rate": 0.001, | |
| "loss": 0.0012, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 0.11301369863013698, | |
| "grad_norm": 0.021169276908040047, | |
| "learning_rate": 0.001, | |
| "loss": 0.0016, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.1132420091324201, | |
| "grad_norm": 0.02852361463010311, | |
| "learning_rate": 0.001, | |
| "loss": 0.0019, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.11347031963470319, | |
| "grad_norm": 0.02134719118475914, | |
| "learning_rate": 0.001, | |
| "loss": 0.0021, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 0.1136986301369863, | |
| "grad_norm": 0.02251187339425087, | |
| "learning_rate": 0.001, | |
| "loss": 0.0023, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 0.1139269406392694, | |
| "grad_norm": 0.01491115614771843, | |
| "learning_rate": 0.001, | |
| "loss": 0.0011, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 0.1141552511415525, | |
| "grad_norm": 0.02773105911910534, | |
| "learning_rate": 0.001, | |
| "loss": 0.0018, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1141552511415525, | |
| "step": 500, | |
| "total_flos": 1.2027275771904e+17, | |
| "train_loss": 0.030520909884246068, | |
| "train_runtime": 3128.3367, | |
| "train_samples_per_second": 3.197, | |
| "train_steps_per_second": 0.16 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.2027275771904e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |