{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.03949447077409163,
  "eval_steps": 500,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.00019747235387045813,
      "grad_norm": 5.788590908050537,
      "learning_rate": 0.0,
      "loss": 1.0436,
      "step": 1
    },
    {
      "epoch": 0.00039494470774091627,
      "grad_norm": 6.300467491149902,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.3195,
      "step": 2
    },
    {
      "epoch": 0.0005924170616113745,
      "grad_norm": 6.489816665649414,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.5645,
      "step": 3
    },
    {
      "epoch": 0.0007898894154818325,
      "grad_norm": 5.287850379943848,
      "learning_rate": 3e-06,
      "loss": 1.154,
      "step": 4
    },
    {
      "epoch": 0.0009873617693522906,
      "grad_norm": 6.123354434967041,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.4052,
      "step": 5
    },
    {
      "epoch": 0.001184834123222749,
      "grad_norm": 5.501026630401611,
      "learning_rate": 5e-06,
      "loss": 1.708,
      "step": 6
    },
    {
      "epoch": 0.001382306477093207,
      "grad_norm": 4.960855007171631,
      "learning_rate": 6e-06,
      "loss": 1.6143,
      "step": 7
    },
    {
      "epoch": 0.001579778830963665,
      "grad_norm": 5.855900764465332,
      "learning_rate": 7e-06,
      "loss": 1.6376,
      "step": 8
    },
    {
      "epoch": 0.0017772511848341231,
      "grad_norm": 5.17061710357666,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.2182,
      "step": 9
    },
    {
      "epoch": 0.0019747235387045812,
      "grad_norm": 4.436169624328613,
      "learning_rate": 9e-06,
      "loss": 1.9812,
      "step": 10
    },
    {
      "epoch": 0.0021721958925750395,
      "grad_norm": 4.55659818649292,
      "learning_rate": 1e-05,
      "loss": 1.1695,
      "step": 11
    },
    {
      "epoch": 0.002369668246445498,
      "grad_norm": 5.105171203613281,
      "learning_rate": 9.947368421052632e-06,
      "loss": 1.1243,
      "step": 12
    },
    {
      "epoch": 0.0025671406003159557,
      "grad_norm": 5.100945472717285,
      "learning_rate": 9.894736842105264e-06,
      "loss": 1.8183,
      "step": 13
    },
    {
      "epoch": 0.002764612954186414,
      "grad_norm": 4.643738746643066,
      "learning_rate": 9.842105263157896e-06,
      "loss": 1.2569,
      "step": 14
    },
    {
      "epoch": 0.002962085308056872,
      "grad_norm": 4.557013034820557,
      "learning_rate": 9.789473684210527e-06,
      "loss": 1.7372,
      "step": 15
    },
    {
      "epoch": 0.00315955766192733,
      "grad_norm": 6.458473205566406,
      "learning_rate": 9.736842105263159e-06,
      "loss": 2.1888,
      "step": 16
    },
    {
      "epoch": 0.0033570300157977884,
      "grad_norm": 5.135770320892334,
      "learning_rate": 9.68421052631579e-06,
      "loss": 1.2197,
      "step": 17
    },
    {
      "epoch": 0.0035545023696682463,
      "grad_norm": 5.171187400817871,
      "learning_rate": 9.631578947368422e-06,
      "loss": 1.6326,
      "step": 18
    },
    {
      "epoch": 0.0037519747235387046,
      "grad_norm": 4.825802326202393,
      "learning_rate": 9.578947368421054e-06,
      "loss": 1.4698,
      "step": 19
    },
    {
      "epoch": 0.0039494470774091624,
      "grad_norm": 4.34477424621582,
      "learning_rate": 9.526315789473684e-06,
      "loss": 1.6102,
      "step": 20
    },
    {
      "epoch": 0.004146919431279621,
      "grad_norm": 4.5733866691589355,
      "learning_rate": 9.473684210526315e-06,
      "loss": 1.8448,
      "step": 21
    },
    {
      "epoch": 0.004344391785150079,
      "grad_norm": 4.53968620300293,
      "learning_rate": 9.421052631578949e-06,
      "loss": 1.2972,
      "step": 22
    },
    {
      "epoch": 0.004541864139020537,
      "grad_norm": 5.408173084259033,
      "learning_rate": 9.36842105263158e-06,
      "loss": 1.7215,
      "step": 23
    },
    {
      "epoch": 0.004739336492890996,
      "grad_norm": 4.2717461585998535,
      "learning_rate": 9.315789473684212e-06,
      "loss": 1.137,
      "step": 24
    },
    {
      "epoch": 0.0049368088467614535,
      "grad_norm": 4.3075785636901855,
      "learning_rate": 9.263157894736842e-06,
      "loss": 1.3048,
      "step": 25
    },
    {
      "epoch": 0.005134281200631911,
      "grad_norm": 4.659534454345703,
      "learning_rate": 9.210526315789474e-06,
      "loss": 0.9883,
      "step": 26
    },
    {
      "epoch": 0.00533175355450237,
      "grad_norm": 4.719169616699219,
      "learning_rate": 9.157894736842105e-06,
      "loss": 1.4926,
      "step": 27
    },
    {
      "epoch": 0.005529225908372828,
      "grad_norm": 4.502306938171387,
      "learning_rate": 9.105263157894739e-06,
      "loss": 1.8264,
      "step": 28
    },
    {
      "epoch": 0.005726698262243286,
      "grad_norm": 4.353489875793457,
      "learning_rate": 9.05263157894737e-06,
      "loss": 1.6681,
      "step": 29
    },
    {
      "epoch": 0.005924170616113744,
      "grad_norm": 5.161799907684326,
      "learning_rate": 9e-06,
      "loss": 1.6566,
      "step": 30
    },
    {
      "epoch": 0.006121642969984202,
      "grad_norm": 4.235696315765381,
      "learning_rate": 8.947368421052632e-06,
      "loss": 1.7276,
      "step": 31
    },
    {
      "epoch": 0.00631911532385466,
      "grad_norm": 6.545216083526611,
      "learning_rate": 8.894736842105264e-06,
      "loss": 1.0481,
      "step": 32
    },
    {
      "epoch": 0.006516587677725118,
      "grad_norm": 4.9834113121032715,
      "learning_rate": 8.842105263157895e-06,
      "loss": 1.4457,
      "step": 33
    },
    {
      "epoch": 0.006714060031595577,
      "grad_norm": 4.448666572570801,
      "learning_rate": 8.789473684210527e-06,
      "loss": 1.7361,
      "step": 34
    },
    {
      "epoch": 0.006911532385466035,
      "grad_norm": 4.735658168792725,
      "learning_rate": 8.736842105263158e-06,
      "loss": 1.3361,
      "step": 35
    },
    {
      "epoch": 0.0071090047393364926,
      "grad_norm": 5.857210636138916,
      "learning_rate": 8.68421052631579e-06,
      "loss": 1.4395,
      "step": 36
    },
    {
      "epoch": 0.007306477093206951,
      "grad_norm": 4.746231555938721,
      "learning_rate": 8.631578947368422e-06,
      "loss": 1.4588,
      "step": 37
    },
    {
      "epoch": 0.007503949447077409,
      "grad_norm": 5.420529365539551,
      "learning_rate": 8.578947368421053e-06,
      "loss": 1.4352,
      "step": 38
    },
    {
      "epoch": 0.007701421800947867,
      "grad_norm": 3.966956853866577,
      "learning_rate": 8.526315789473685e-06,
      "loss": 1.7395,
      "step": 39
    },
    {
      "epoch": 0.007898894154818325,
      "grad_norm": 4.626506805419922,
      "learning_rate": 8.473684210526317e-06,
      "loss": 1.2152,
      "step": 40
    },
    {
      "epoch": 0.008096366508688783,
      "grad_norm": 5.946537971496582,
      "learning_rate": 8.421052631578948e-06,
      "loss": 1.2569,
      "step": 41
    },
    {
      "epoch": 0.008293838862559242,
      "grad_norm": 6.078729152679443,
      "learning_rate": 8.36842105263158e-06,
      "loss": 2.0072,
      "step": 42
    },
    {
      "epoch": 0.0084913112164297,
      "grad_norm": 5.362630844116211,
      "learning_rate": 8.315789473684212e-06,
      "loss": 1.6189,
      "step": 43
    },
    {
      "epoch": 0.008688783570300158,
      "grad_norm": 4.6319098472595215,
      "learning_rate": 8.263157894736843e-06,
      "loss": 1.1937,
      "step": 44
    },
    {
      "epoch": 0.008886255924170616,
      "grad_norm": 5.145988464355469,
      "learning_rate": 8.210526315789475e-06,
      "loss": 1.48,
      "step": 45
    },
    {
      "epoch": 0.009083728278041074,
      "grad_norm": 5.191286563873291,
      "learning_rate": 8.157894736842106e-06,
      "loss": 1.6115,
      "step": 46
    },
    {
      "epoch": 0.009281200631911532,
      "grad_norm": 4.6075544357299805,
      "learning_rate": 8.105263157894736e-06,
      "loss": 1.3963,
      "step": 47
    },
    {
      "epoch": 0.009478672985781991,
      "grad_norm": 4.724617958068848,
      "learning_rate": 8.052631578947368e-06,
      "loss": 1.7869,
      "step": 48
    },
    {
      "epoch": 0.009676145339652449,
      "grad_norm": 4.976570129394531,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.5635,
      "step": 49
    },
    {
      "epoch": 0.009873617693522907,
      "grad_norm": 5.121829032897949,
      "learning_rate": 7.947368421052633e-06,
      "loss": 1.246,
      "step": 50
    },
    {
      "epoch": 0.010071090047393365,
      "grad_norm": 5.260928153991699,
      "learning_rate": 7.894736842105265e-06,
      "loss": 1.98,
      "step": 51
    },
    {
      "epoch": 0.010268562401263823,
      "grad_norm": 5.072506904602051,
      "learning_rate": 7.842105263157895e-06,
      "loss": 1.0555,
      "step": 52
    },
    {
      "epoch": 0.01046603475513428,
      "grad_norm": 4.6401872634887695,
      "learning_rate": 7.789473684210526e-06,
      "loss": 1.5456,
      "step": 53
    },
    {
      "epoch": 0.01066350710900474,
      "grad_norm": 5.566153526306152,
      "learning_rate": 7.736842105263158e-06,
      "loss": 1.7965,
      "step": 54
    },
    {
      "epoch": 0.010860979462875198,
      "grad_norm": 5.0998215675354,
      "learning_rate": 7.68421052631579e-06,
      "loss": 1.3262,
      "step": 55
    },
    {
      "epoch": 0.011058451816745656,
      "grad_norm": 4.437518119812012,
      "learning_rate": 7.631578947368423e-06,
      "loss": 1.7174,
      "step": 56
    },
    {
      "epoch": 0.011255924170616114,
      "grad_norm": 3.8838698863983154,
      "learning_rate": 7.578947368421054e-06,
      "loss": 1.185,
      "step": 57
    },
    {
      "epoch": 0.011453396524486572,
      "grad_norm": 4.112951278686523,
      "learning_rate": 7.526315789473685e-06,
      "loss": 1.3785,
      "step": 58
    },
    {
      "epoch": 0.01165086887835703,
      "grad_norm": 4.612501621246338,
      "learning_rate": 7.473684210526316e-06,
      "loss": 1.2861,
      "step": 59
    },
    {
      "epoch": 0.011848341232227487,
      "grad_norm": 4.541945457458496,
      "learning_rate": 7.421052631578948e-06,
      "loss": 1.0096,
      "step": 60
    },
    {
      "epoch": 0.012045813586097947,
      "grad_norm": 6.567755699157715,
      "learning_rate": 7.368421052631579e-06,
      "loss": 1.0163,
      "step": 61
    },
    {
      "epoch": 0.012243285939968405,
      "grad_norm": 4.897511959075928,
      "learning_rate": 7.315789473684212e-06,
      "loss": 1.8322,
      "step": 62
    },
    {
      "epoch": 0.012440758293838863,
      "grad_norm": 4.3046064376831055,
      "learning_rate": 7.263157894736843e-06,
      "loss": 1.0891,
      "step": 63
    },
    {
      "epoch": 0.01263823064770932,
      "grad_norm": 4.533966064453125,
      "learning_rate": 7.210526315789474e-06,
      "loss": 1.3371,
      "step": 64
    },
    {
      "epoch": 0.012835703001579778,
      "grad_norm": 4.470656394958496,
      "learning_rate": 7.157894736842106e-06,
      "loss": 2.3414,
      "step": 65
    },
    {
      "epoch": 0.013033175355450236,
      "grad_norm": 4.738101959228516,
      "learning_rate": 7.1052631578947375e-06,
      "loss": 1.765,
      "step": 66
    },
    {
      "epoch": 0.013230647709320696,
      "grad_norm": 3.870649814605713,
      "learning_rate": 7.052631578947369e-06,
      "loss": 1.6424,
      "step": 67
    },
    {
      "epoch": 0.013428120063191154,
      "grad_norm": 7.043670177459717,
      "learning_rate": 7e-06,
      "loss": 1.5049,
      "step": 68
    },
    {
      "epoch": 0.013625592417061612,
      "grad_norm": 4.199113368988037,
      "learning_rate": 6.947368421052632e-06,
      "loss": 1.5253,
      "step": 69
    },
    {
      "epoch": 0.01382306477093207,
      "grad_norm": 4.738468170166016,
      "learning_rate": 6.894736842105264e-06,
      "loss": 1.4544,
      "step": 70
    },
    {
      "epoch": 0.014020537124802527,
      "grad_norm": 5.083221912384033,
      "learning_rate": 6.842105263157896e-06,
      "loss": 1.4874,
      "step": 71
    },
    {
      "epoch": 0.014218009478672985,
      "grad_norm": 4.9555253982543945,
      "learning_rate": 6.789473684210527e-06,
      "loss": 1.7049,
      "step": 72
    },
    {
      "epoch": 0.014415481832543445,
      "grad_norm": 4.266180992126465,
      "learning_rate": 6.736842105263158e-06,
      "loss": 1.2864,
      "step": 73
    },
    {
      "epoch": 0.014612954186413903,
      "grad_norm": 4.510780334472656,
      "learning_rate": 6.68421052631579e-06,
      "loss": 1.4615,
      "step": 74
    },
    {
      "epoch": 0.01481042654028436,
      "grad_norm": 4.075244426727295,
      "learning_rate": 6.631578947368421e-06,
      "loss": 1.0669,
      "step": 75
    },
    {
      "epoch": 0.015007898894154818,
      "grad_norm": 4.169254302978516,
      "learning_rate": 6.578947368421054e-06,
      "loss": 1.4458,
      "step": 76
    },
    {
      "epoch": 0.015205371248025276,
      "grad_norm": 4.540365695953369,
      "learning_rate": 6.526315789473685e-06,
      "loss": 1.4148,
      "step": 77
    },
    {
      "epoch": 0.015402843601895734,
      "grad_norm": 4.700695037841797,
      "learning_rate": 6.473684210526316e-06,
      "loss": 2.0347,
      "step": 78
    },
    {
      "epoch": 0.015600315955766192,
      "grad_norm": 4.982248306274414,
      "learning_rate": 6.421052631578948e-06,
      "loss": 1.3731,
      "step": 79
    },
    {
      "epoch": 0.01579778830963665,
      "grad_norm": 5.1645941734313965,
      "learning_rate": 6.3684210526315795e-06,
      "loss": 1.1041,
      "step": 80
    },
    {
      "epoch": 0.015995260663507108,
      "grad_norm": 3.988223075866699,
      "learning_rate": 6.31578947368421e-06,
      "loss": 1.8261,
      "step": 81
    },
    {
      "epoch": 0.016192733017377565,
      "grad_norm": 5.132425785064697,
      "learning_rate": 6.263157894736842e-06,
      "loss": 1.3754,
      "step": 82
    },
    {
      "epoch": 0.016390205371248027,
      "grad_norm": 4.158195972442627,
      "learning_rate": 6.2105263157894745e-06,
      "loss": 1.233,
      "step": 83
    },
    {
      "epoch": 0.016587677725118485,
      "grad_norm": 5.385928630828857,
      "learning_rate": 6.157894736842106e-06,
      "loss": 1.6838,
      "step": 84
    },
    {
      "epoch": 0.016785150078988943,
      "grad_norm": 4.63645076751709,
      "learning_rate": 6.105263157894738e-06,
      "loss": 1.9314,
      "step": 85
    },
    {
      "epoch": 0.0169826224328594,
      "grad_norm": 5.3244757652282715,
      "learning_rate": 6.0526315789473685e-06,
      "loss": 1.4223,
      "step": 86
    },
    {
      "epoch": 0.017180094786729858,
      "grad_norm": 4.603305339813232,
      "learning_rate": 6e-06,
      "loss": 1.7243,
      "step": 87
    },
    {
      "epoch": 0.017377567140600316,
      "grad_norm": 5.223212718963623,
      "learning_rate": 5.947368421052632e-06,
      "loss": 1.337,
      "step": 88
    },
    {
      "epoch": 0.017575039494470774,
      "grad_norm": 4.344864845275879,
      "learning_rate": 5.8947368421052634e-06,
      "loss": 1.4767,
      "step": 89
    },
    {
      "epoch": 0.017772511848341232,
      "grad_norm": 4.268956184387207,
      "learning_rate": 5.842105263157896e-06,
      "loss": 1.0763,
      "step": 90
    },
    {
      "epoch": 0.01796998420221169,
      "grad_norm": 4.882099151611328,
      "learning_rate": 5.789473684210527e-06,
      "loss": 1.7006,
      "step": 91
    },
    {
      "epoch": 0.018167456556082148,
      "grad_norm": 5.254153728485107,
      "learning_rate": 5.736842105263158e-06,
      "loss": 2.0318,
      "step": 92
    },
    {
      "epoch": 0.018364928909952605,
      "grad_norm": 4.496589660644531,
      "learning_rate": 5.68421052631579e-06,
      "loss": 1.0075,
      "step": 93
    },
    {
      "epoch": 0.018562401263823063,
      "grad_norm": 4.610377788543701,
      "learning_rate": 5.631578947368422e-06,
      "loss": 0.973,
      "step": 94
    },
    {
      "epoch": 0.018759873617693525,
      "grad_norm": 4.102587699890137,
      "learning_rate": 5.578947368421052e-06,
      "loss": 1.4508,
      "step": 95
    },
    {
      "epoch": 0.018957345971563982,
      "grad_norm": 4.820801258087158,
      "learning_rate": 5.526315789473685e-06,
      "loss": 2.234,
      "step": 96
    },
    {
      "epoch": 0.01915481832543444,
      "grad_norm": 4.17614221572876,
      "learning_rate": 5.4736842105263165e-06,
      "loss": 0.9472,
      "step": 97
    },
    {
      "epoch": 0.019352290679304898,
      "grad_norm": 4.681643962860107,
      "learning_rate": 5.421052631578948e-06,
      "loss": 0.952,
      "step": 98
    },
    {
      "epoch": 0.019549763033175356,
      "grad_norm": 4.793570041656494,
      "learning_rate": 5.36842105263158e-06,
      "loss": 2.1524,
      "step": 99
    },
    {
      "epoch": 0.019747235387045814,
      "grad_norm": 5.580649375915527,
      "learning_rate": 5.315789473684211e-06,
      "loss": 1.168,
      "step": 100
    },
    {
      "epoch": 0.019944707740916272,
      "grad_norm": 4.42297887802124,
      "learning_rate": 5.263157894736842e-06,
      "loss": 1.0549,
      "step": 101
    },
    {
      "epoch": 0.02014218009478673,
      "grad_norm": 4.290710926055908,
      "learning_rate": 5.210526315789474e-06,
      "loss": 1.4328,
      "step": 102
    },
    {
      "epoch": 0.020339652448657188,
      "grad_norm": 4.0357255935668945,
      "learning_rate": 5.157894736842106e-06,
      "loss": 0.9894,
      "step": 103
    },
    {
      "epoch": 0.020537124802527645,
      "grad_norm": 4.805973529815674,
      "learning_rate": 5.105263157894738e-06,
      "loss": 1.3093,
      "step": 104
    },
    {
      "epoch": 0.020734597156398103,
      "grad_norm": 4.185112953186035,
      "learning_rate": 5.052631578947369e-06,
      "loss": 1.4606,
      "step": 105
    },
    {
      "epoch": 0.02093206951026856,
      "grad_norm": 4.384559631347656,
      "learning_rate": 5e-06,
      "loss": 1.0051,
      "step": 106
    },
    {
      "epoch": 0.02112954186413902,
      "grad_norm": 4.920189380645752,
      "learning_rate": 4.947368421052632e-06,
      "loss": 1.1721,
      "step": 107
    },
    {
      "epoch": 0.02132701421800948,
      "grad_norm": 4.506773471832275,
      "learning_rate": 4.894736842105264e-06,
      "loss": 1.4764,
      "step": 108
    },
    {
      "epoch": 0.021524486571879938,
      "grad_norm": 4.980959415435791,
      "learning_rate": 4.842105263157895e-06,
      "loss": 1.1304,
      "step": 109
    },
    {
      "epoch": 0.021721958925750396,
      "grad_norm": 4.118868827819824,
      "learning_rate": 4.789473684210527e-06,
      "loss": 1.1892,
      "step": 110
    },
    {
      "epoch": 0.021919431279620854,
      "grad_norm": 6.19287109375,
      "learning_rate": 4.736842105263158e-06,
      "loss": 1.6254,
      "step": 111
    },
    {
      "epoch": 0.022116903633491312,
      "grad_norm": 5.811559200286865,
      "learning_rate": 4.68421052631579e-06,
      "loss": 1.8662,
      "step": 112
    },
    {
      "epoch": 0.02231437598736177,
      "grad_norm": 4.6513352394104,
      "learning_rate": 4.631578947368421e-06,
      "loss": 1.3816,
      "step": 113
    },
    {
      "epoch": 0.022511848341232227,
      "grad_norm": 5.178617477416992,
      "learning_rate": 4.578947368421053e-06,
      "loss": 1.2535,
      "step": 114
    },
    {
      "epoch": 0.022709320695102685,
      "grad_norm": 3.830137014389038,
      "learning_rate": 4.526315789473685e-06,
      "loss": 1.5312,
      "step": 115
    },
    {
      "epoch": 0.022906793048973143,
      "grad_norm": 4.620641231536865,
      "learning_rate": 4.473684210526316e-06,
      "loss": 1.5528,
      "step": 116
    },
    {
      "epoch": 0.0231042654028436,
      "grad_norm": 5.0326738357543945,
      "learning_rate": 4.4210526315789476e-06,
      "loss": 1.3443,
      "step": 117
    },
    {
      "epoch": 0.02330173775671406,
      "grad_norm": 4.62188720703125,
      "learning_rate": 4.368421052631579e-06,
      "loss": 1.0445,
      "step": 118
    },
    {
      "epoch": 0.023499210110584517,
      "grad_norm": 3.9635679721832275,
      "learning_rate": 4.315789473684211e-06,
      "loss": 1.5812,
      "step": 119
    },
    {
      "epoch": 0.023696682464454975,
      "grad_norm": 5.0226664543151855,
      "learning_rate": 4.2631578947368425e-06,
      "loss": 1.2235,
      "step": 120
    },
    {
      "epoch": 0.023894154818325436,
      "grad_norm": 5.353757858276367,
      "learning_rate": 4.210526315789474e-06,
      "loss": 1.5831,
      "step": 121
    },
    {
      "epoch": 0.024091627172195894,
      "grad_norm": 4.838202476501465,
      "learning_rate": 4.157894736842106e-06,
      "loss": 1.7689,
      "step": 122
    },
    {
      "epoch": 0.02428909952606635,
      "grad_norm": 4.49991512298584,
      "learning_rate": 4.105263157894737e-06,
      "loss": 0.828,
      "step": 123
    },
    {
      "epoch": 0.02448657187993681,
      "grad_norm": 4.480604648590088,
      "learning_rate": 4.052631578947368e-06,
      "loss": 1.6869,
      "step": 124
    },
    {
      "epoch": 0.024684044233807267,
      "grad_norm": 4.725250244140625,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.7038,
      "step": 125
    },
    {
      "epoch": 0.024881516587677725,
      "grad_norm": 4.806907653808594,
      "learning_rate": 3.947368421052632e-06,
      "loss": 1.6985,
      "step": 126
    },
    {
      "epoch": 0.025078988941548183,
      "grad_norm": 4.734091758728027,
      "learning_rate": 3.894736842105263e-06,
      "loss": 1.5138,
      "step": 127
    },
    {
      "epoch": 0.02527646129541864,
      "grad_norm": 5.3406243324279785,
      "learning_rate": 3.842105263157895e-06,
      "loss": 1.7152,
      "step": 128
    },
    {
      "epoch": 0.0254739336492891,
      "grad_norm": 4.890450477600098,
      "learning_rate": 3.789473684210527e-06,
      "loss": 1.734,
      "step": 129
    },
    {
      "epoch": 0.025671406003159557,
      "grad_norm": 5.458994388580322,
      "learning_rate": 3.736842105263158e-06,
      "loss": 1.5888,
      "step": 130
    },
    {
      "epoch": 0.025868878357030015,
      "grad_norm": 4.655605316162109,
      "learning_rate": 3.6842105263157896e-06,
      "loss": 1.7478,
      "step": 131
    },
    {
      "epoch": 0.026066350710900472,
      "grad_norm": 7.664575576782227,
      "learning_rate": 3.6315789473684217e-06,
      "loss": 1.5616,
      "step": 132
    },
    {
      "epoch": 0.026263823064770934,
      "grad_norm": 5.207353115081787,
      "learning_rate": 3.578947368421053e-06,
      "loss": 1.8918,
      "step": 133
    },
    {
      "epoch": 0.02646129541864139,
      "grad_norm": 3.969021797180176,
      "learning_rate": 3.5263157894736846e-06,
      "loss": 1.0807,
      "step": 134
    },
    {
      "epoch": 0.02665876777251185,
      "grad_norm": 5.148044586181641,
      "learning_rate": 3.473684210526316e-06,
      "loss": 1.5419,
      "step": 135
    },
    {
      "epoch": 0.026856240126382307,
      "grad_norm": 5.609622955322266,
      "learning_rate": 3.421052631578948e-06,
      "loss": 1.2194,
      "step": 136
    },
    {
      "epoch": 0.027053712480252765,
      "grad_norm": 4.281411170959473,
      "learning_rate": 3.368421052631579e-06,
      "loss": 0.9111,
      "step": 137
    },
    {
      "epoch": 0.027251184834123223,
      "grad_norm": 4.415678977966309,
      "learning_rate": 3.3157894736842107e-06,
      "loss": 1.999,
      "step": 138
    },
    {
      "epoch": 0.02744865718799368,
      "grad_norm": 4.46459436416626,
      "learning_rate": 3.2631578947368423e-06,
      "loss": 1.749,
      "step": 139
    },
    {
      "epoch": 0.02764612954186414,
      "grad_norm": 4.180970191955566,
      "learning_rate": 3.210526315789474e-06,
      "loss": 1.4696,
      "step": 140
    },
    {
      "epoch": 0.027843601895734597,
      "grad_norm": 4.308414459228516,
      "learning_rate": 3.157894736842105e-06,
      "loss": 1.6192,
      "step": 141
    },
    {
      "epoch": 0.028041074249605055,
      "grad_norm": 6.27396821975708,
      "learning_rate": 3.1052631578947372e-06,
      "loss": 0.9774,
      "step": 142
    },
    {
      "epoch": 0.028238546603475512,
      "grad_norm": 5.713791847229004,
      "learning_rate": 3.052631578947369e-06,
      "loss": 1.2108,
      "step": 143
    },
    {
      "epoch": 0.02843601895734597,
      "grad_norm": 3.94455885887146,
      "learning_rate": 3e-06,
      "loss": 0.928,
      "step": 144
    },
    {
      "epoch": 0.028633491311216428,
      "grad_norm": 3.577357769012451,
      "learning_rate": 2.9473684210526317e-06,
      "loss": 1.2109,
      "step": 145
    },
    {
      "epoch": 0.02883096366508689,
      "grad_norm": 4.142393112182617,
      "learning_rate": 2.8947368421052634e-06,
      "loss": 0.8309,
      "step": 146
    },
    {
      "epoch": 0.029028436018957347,
      "grad_norm": 5.459789276123047,
      "learning_rate": 2.842105263157895e-06,
      "loss": 1.6238,
      "step": 147
    },
    {
      "epoch": 0.029225908372827805,
      "grad_norm": 4.358528137207031,
      "learning_rate": 2.789473684210526e-06,
      "loss": 1.2229,
      "step": 148
    },
    {
      "epoch": 0.029423380726698263,
      "grad_norm": 4.201858043670654,
      "learning_rate": 2.7368421052631583e-06,
      "loss": 0.8191,
      "step": 149
    },
    {
      "epoch": 0.02962085308056872,
      "grad_norm": 5.12843656539917,
      "learning_rate": 2.68421052631579e-06,
      "loss": 2.0575,
      "step": 150
    },
    {
      "epoch": 0.02981832543443918,
      "grad_norm": 3.9480583667755127,
      "learning_rate": 2.631578947368421e-06,
      "loss": 1.2752,
      "step": 151
    },
    {
      "epoch": 0.030015797788309637,
      "grad_norm": 4.900203227996826,
      "learning_rate": 2.578947368421053e-06,
      "loss": 0.6156,
      "step": 152
    },
    {
      "epoch": 0.030213270142180094,
      "grad_norm": 5.017102241516113,
      "learning_rate": 2.5263157894736844e-06,
      "loss": 1.4791,
      "step": 153
    },
    {
      "epoch": 0.030410742496050552,
      "grad_norm": 4.326578140258789,
      "learning_rate": 2.473684210526316e-06,
      "loss": 1.1194,
      "step": 154
    },
    {
      "epoch": 0.03060821484992101,
      "grad_norm": 4.385910511016846,
      "learning_rate": 2.4210526315789477e-06,
      "loss": 1.8029,
      "step": 155
    },
    {
      "epoch": 0.030805687203791468,
      "grad_norm": 3.988187551498413,
      "learning_rate": 2.368421052631579e-06,
      "loss": 0.9693,
      "step": 156
    },
    {
      "epoch": 0.031003159557661926,
      "grad_norm": 4.455026149749756,
      "learning_rate": 2.3157894736842105e-06,
      "loss": 1.3069,
      "step": 157
    },
    {
      "epoch": 0.031200631911532384,
      "grad_norm": 5.547366142272949,
      "learning_rate": 2.2631578947368426e-06,
      "loss": 1.5911,
      "step": 158
    },
    {
      "epoch": 0.03139810426540284,
      "grad_norm": 5.407074928283691,
      "learning_rate": 2.2105263157894738e-06,
      "loss": 1.4276,
      "step": 159
    },
    {
      "epoch": 0.0315955766192733,
      "grad_norm": 4.332579135894775,
      "learning_rate": 2.1578947368421054e-06,
      "loss": 1.0162,
      "step": 160
    },
    {
      "epoch": 0.03179304897314376,
      "grad_norm": 4.770085334777832,
      "learning_rate": 2.105263157894737e-06,
      "loss": 1.5811,
      "step": 161
    },
    {
      "epoch": 0.031990521327014215,
      "grad_norm": 4.930882930755615,
      "learning_rate": 2.0526315789473687e-06,
      "loss": 1.1303,
      "step": 162
    },
    {
      "epoch": 0.03218799368088467,
      "grad_norm": 5.203794479370117,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.135,
      "step": 163
    },
    {
      "epoch": 0.03238546603475513,
      "grad_norm": 3.9070425033569336,
      "learning_rate": 1.9473684210526315e-06,
      "loss": 0.9977,
      "step": 164
    },
    {
      "epoch": 0.032582938388625596,
      "grad_norm": 5.830733299255371,
      "learning_rate": 1.8947368421052634e-06,
      "loss": 1.8587,
      "step": 165
    },
    {
      "epoch": 0.032780410742496054,
      "grad_norm": 4.05476713180542,
      "learning_rate": 1.8421052631578948e-06,
      "loss": 1.9495,
      "step": 166
    },
    {
      "epoch": 0.03297788309636651,
      "grad_norm": 3.980226755142212,
      "learning_rate": 1.7894736842105265e-06,
      "loss": 1.6,
      "step": 167
    },
    {
      "epoch": 0.03317535545023697,
      "grad_norm": 6.381178379058838,
      "learning_rate": 1.736842105263158e-06,
      "loss": 1.6905,
      "step": 168
    },
    {
      "epoch": 0.03337282780410743,
      "grad_norm": 4.4184889793396,
      "learning_rate": 1.6842105263157895e-06,
      "loss": 1.4703,
      "step": 169
    },
    {
      "epoch": 0.033570300157977885,
      "grad_norm": 4.5157470703125,
      "learning_rate": 1.6315789473684212e-06,
      "loss": 1.273,
      "step": 170
    },
    {
      "epoch": 0.03376777251184834,
      "grad_norm": 4.454701900482178,
      "learning_rate": 1.5789473684210526e-06,
      "loss": 1.9122,
      "step": 171
    },
    {
      "epoch": 0.0339652448657188,
      "grad_norm": 4.891290664672852,
      "learning_rate": 1.5263157894736844e-06,
      "loss": 1.514,
      "step": 172
    },
    {
      "epoch": 0.03416271721958926,
      "grad_norm": 4.397899627685547,
      "learning_rate": 1.4736842105263159e-06,
      "loss": 1.7544,
      "step": 173
    },
    {
      "epoch": 0.034360189573459717,
      "grad_norm": 5.422823429107666,
      "learning_rate": 1.4210526315789475e-06,
      "loss": 1.5294,
      "step": 174
    },
    {
      "epoch": 0.034557661927330174,
      "grad_norm": 5.267470359802246,
      "learning_rate": 1.3684210526315791e-06,
      "loss": 1.2176,
      "step": 175
    },
    {
      "epoch": 0.03475513428120063,
      "grad_norm": 4.583755016326904,
      "learning_rate": 1.3157894736842106e-06,
      "loss": 1.3583,
      "step": 176
    },
    {
      "epoch": 0.03495260663507109,
      "grad_norm": 4.745589733123779,
      "learning_rate": 1.2631578947368422e-06,
      "loss": 1.6765,
      "step": 177
    },
    {
      "epoch": 0.03515007898894155,
      "grad_norm": 4.703863620758057,
      "learning_rate": 1.2105263157894738e-06,
      "loss": 1.2263,
      "step": 178
    },
    {
      "epoch": 0.035347551342812006,
      "grad_norm": 4.113995552062988,
      "learning_rate": 1.1578947368421053e-06,
      "loss": 1.8098,
      "step": 179
    },
    {
      "epoch": 0.035545023696682464,
      "grad_norm": 5.088428020477295,
      "learning_rate": 1.1052631578947369e-06,
      "loss": 1.7887,
      "step": 180
    },
    {
      "epoch": 0.03574249605055292,
      "grad_norm": 4.361863613128662,
      "learning_rate": 1.0526315789473685e-06,
      "loss": 1.8482,
      "step": 181
    },
    {
      "epoch": 0.03593996840442338,
      "grad_norm": 3.7712790966033936,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.0318,
      "step": 182
    },
    {
      "epoch": 0.03613744075829384,
      "grad_norm": 4.38617467880249,
      "learning_rate": 9.473684210526317e-07,
      "loss": 1.1747,
      "step": 183
    },
    {
      "epoch": 0.036334913112164295,
      "grad_norm": 4.757498741149902,
      "learning_rate": 8.947368421052632e-07,
      "loss": 1.591,
      "step": 184
    },
    {
      "epoch": 0.03653238546603475,
      "grad_norm": 4.661401748657227,
      "learning_rate": 8.421052631578948e-07,
      "loss": 1.2443,
      "step": 185
    },
    {
      "epoch": 0.03672985781990521,
      "grad_norm": 4.178214073181152,
      "learning_rate": 7.894736842105263e-07,
      "loss": 1.6486,
      "step": 186
    },
    {
      "epoch": 0.03692733017377567,
      "grad_norm": 4.652418613433838,
      "learning_rate": 7.368421052631579e-07,
      "loss": 1.7046,
      "step": 187
    },
    {
      "epoch": 0.03712480252764613,
      "grad_norm": 5.367217540740967,
      "learning_rate": 6.842105263157896e-07,
      "loss": 1.1643,
      "step": 188
    },
    {
      "epoch": 0.037322274881516584,
      "grad_norm": 5.026525020599365,
      "learning_rate": 6.315789473684211e-07,
      "loss": 1.8984,
      "step": 189
    },
    {
      "epoch": 0.03751974723538705,
      "grad_norm": 5.15156888961792,
      "learning_rate": 5.789473684210526e-07,
      "loss": 1.199,
      "step": 190
    },
    {
      "epoch": 0.03771721958925751,
      "grad_norm": 4.1339111328125,
      "learning_rate": 5.263157894736843e-07,
      "loss": 1.2138,
      "step": 191
    },
    {
      "epoch": 0.037914691943127965,
      "grad_norm": 4.904068946838379,
      "learning_rate": 4.7368421052631585e-07,
      "loss": 0.9992,
      "step": 192
    },
    {
      "epoch": 0.03811216429699842,
      "grad_norm": 4.655853271484375,
      "learning_rate": 4.210526315789474e-07,
      "loss": 0.9699,
      "step": 193
    },
    {
      "epoch": 0.03830963665086888,
      "grad_norm": 4.786022663116455,
      "learning_rate": 3.6842105263157896e-07,
      "loss": 1.1047,
      "step": 194
    },
    {
      "epoch": 0.03850710900473934,
      "grad_norm": 4.2019362449646,
      "learning_rate": 3.1578947368421055e-07,
      "loss": 1.3494,
      "step": 195
    },
    {
      "epoch": 0.038704581358609796,
      "grad_norm": 4.608132839202881,
      "learning_rate": 2.6315789473684213e-07,
      "loss": 1.5102,
      "step": 196
    },
    {
      "epoch": 0.038902053712480254,
      "grad_norm": 3.955866813659668,
      "learning_rate": 2.105263157894737e-07,
      "loss": 0.9329,
      "step": 197
    },
    {
      "epoch": 0.03909952606635071,
      "grad_norm": 3.948068857192993,
      "learning_rate": 1.5789473684210527e-07,
      "loss": 0.9115,
      "step": 198
    },
    {
      "epoch": 0.03929699842022117,
      "grad_norm": 4.394677639007568,
      "learning_rate": 1.0526315789473685e-07,
      "loss": 0.842,
      "step": 199
    },
    {
      "epoch": 0.03949447077409163,
      "grad_norm": 4.138340473175049,
      "learning_rate": 5.263157894736842e-08,
      "loss": 1.0818,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6305414963527680.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}