{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1188, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008417508417508417, "grad_norm": null, "learning_rate": 4.990179573512907e-05, "loss": 3.4797, "step": 10 }, { "epoch": 0.016835016835016835, "grad_norm": 1.8723673820495605, "learning_rate": 4.97615039281706e-05, "loss": 3.4589, "step": 20 }, { "epoch": 0.025252525252525252, "grad_norm": 1.6116461753845215, "learning_rate": 4.962121212121213e-05, "loss": 3.3366, "step": 30 }, { "epoch": 0.03367003367003367, "grad_norm": 1.6241545677185059, "learning_rate": 4.9480920314253646e-05, "loss": 3.0591, "step": 40 }, { "epoch": 0.04208754208754209, "grad_norm": 1.6978979110717773, "learning_rate": 4.934062850729518e-05, "loss": 3.2064, "step": 50 }, { "epoch": 0.050505050505050504, "grad_norm": 2.229762077331543, "learning_rate": 4.92003367003367e-05, "loss": 3.0569, "step": 60 }, { "epoch": 0.058922558922558925, "grad_norm": 1.3693021535873413, "learning_rate": 4.906004489337823e-05, "loss": 2.9715, "step": 70 }, { "epoch": 0.06734006734006734, "grad_norm": 1.712484359741211, "learning_rate": 4.891975308641975e-05, "loss": 2.9787, "step": 80 }, { "epoch": 0.07575757575757576, "grad_norm": 1.933789610862732, "learning_rate": 4.877946127946128e-05, "loss": 2.9016, "step": 90 }, { "epoch": 0.08417508417508418, "grad_norm": 2.0508992671966553, "learning_rate": 4.863916947250281e-05, "loss": 2.93, "step": 100 }, { "epoch": 0.09259259259259259, "grad_norm": 1.8941344022750854, "learning_rate": 4.8498877665544335e-05, "loss": 2.852, "step": 110 }, { "epoch": 0.10101010101010101, "grad_norm": 1.5971471071243286, "learning_rate": 4.835858585858586e-05, "loss": 2.8484, "step": 120 }, { "epoch": 0.10942760942760943, "grad_norm": 1.8011534214019775, "learning_rate": 4.8218294051627386e-05, "loss": 2.8801, "step": 130 }, { "epoch": 
0.11784511784511785, "grad_norm": 2.071338176727295, "learning_rate": 4.807800224466891e-05, "loss": 2.7344, "step": 140 }, { "epoch": 0.12626262626262627, "grad_norm": 1.5685261487960815, "learning_rate": 4.793771043771044e-05, "loss": 2.8886, "step": 150 }, { "epoch": 0.13468013468013468, "grad_norm": 1.719056487083435, "learning_rate": 4.779741863075197e-05, "loss": 2.7326, "step": 160 }, { "epoch": 0.14309764309764308, "grad_norm": 1.8598765134811401, "learning_rate": 4.765712682379349e-05, "loss": 2.7544, "step": 170 }, { "epoch": 0.15151515151515152, "grad_norm": 1.7602612972259521, "learning_rate": 4.751683501683502e-05, "loss": 2.7759, "step": 180 }, { "epoch": 0.15993265993265993, "grad_norm": 2.0498409271240234, "learning_rate": 4.7376543209876543e-05, "loss": 2.7495, "step": 190 }, { "epoch": 0.16835016835016836, "grad_norm": 1.8029284477233887, "learning_rate": 4.723625140291807e-05, "loss": 2.6568, "step": 200 }, { "epoch": 0.17676767676767677, "grad_norm": 2.052203416824341, "learning_rate": 4.70959595959596e-05, "loss": 2.6162, "step": 210 }, { "epoch": 0.18518518518518517, "grad_norm": 1.693206787109375, "learning_rate": 4.6955667789001126e-05, "loss": 2.7206, "step": 220 }, { "epoch": 0.1936026936026936, "grad_norm": 1.7983921766281128, "learning_rate": 4.681537598204265e-05, "loss": 2.699, "step": 230 }, { "epoch": 0.20202020202020202, "grad_norm": 2.1269702911376953, "learning_rate": 4.6675084175084176e-05, "loss": 2.8926, "step": 240 }, { "epoch": 0.21043771043771045, "grad_norm": 2.1926703453063965, "learning_rate": 4.65347923681257e-05, "loss": 2.6839, "step": 250 }, { "epoch": 0.21885521885521886, "grad_norm": 1.9844324588775635, "learning_rate": 4.639450056116723e-05, "loss": 2.6166, "step": 260 }, { "epoch": 0.22727272727272727, "grad_norm": 2.1632473468780518, "learning_rate": 4.625420875420876e-05, "loss": 2.6975, "step": 270 }, { "epoch": 0.2356902356902357, "grad_norm": 1.8100026845932007, "learning_rate": 4.6113916947250283e-05, 
"loss": 2.7679, "step": 280 }, { "epoch": 0.2441077441077441, "grad_norm": 1.765960693359375, "learning_rate": 4.597362514029181e-05, "loss": 2.6372, "step": 290 }, { "epoch": 0.25252525252525254, "grad_norm": 2.0554540157318115, "learning_rate": 4.5833333333333334e-05, "loss": 2.707, "step": 300 }, { "epoch": 0.2609427609427609, "grad_norm": 1.662891149520874, "learning_rate": 4.5693041526374866e-05, "loss": 2.5671, "step": 310 }, { "epoch": 0.26936026936026936, "grad_norm": 1.9311376810073853, "learning_rate": 4.555274971941639e-05, "loss": 2.6173, "step": 320 }, { "epoch": 0.2777777777777778, "grad_norm": 2.077493667602539, "learning_rate": 4.541245791245791e-05, "loss": 2.7469, "step": 330 }, { "epoch": 0.28619528619528617, "grad_norm": 1.8855682611465454, "learning_rate": 4.527216610549944e-05, "loss": 2.6762, "step": 340 }, { "epoch": 0.2946127946127946, "grad_norm": 3.0554871559143066, "learning_rate": 4.5131874298540966e-05, "loss": 2.6689, "step": 350 }, { "epoch": 0.30303030303030304, "grad_norm": 2.175938129425049, "learning_rate": 4.49915824915825e-05, "loss": 2.5579, "step": 360 }, { "epoch": 0.3114478114478115, "grad_norm": 2.187106132507324, "learning_rate": 4.485129068462402e-05, "loss": 2.6166, "step": 370 }, { "epoch": 0.31986531986531985, "grad_norm": 2.1227610111236572, "learning_rate": 4.471099887766554e-05, "loss": 2.6637, "step": 380 }, { "epoch": 0.3282828282828283, "grad_norm": 2.3033812046051025, "learning_rate": 4.4570707070707074e-05, "loss": 2.6204, "step": 390 }, { "epoch": 0.3367003367003367, "grad_norm": 2.34395432472229, "learning_rate": 4.44304152637486e-05, "loss": 2.6941, "step": 400 }, { "epoch": 0.3451178451178451, "grad_norm": 1.7339164018630981, "learning_rate": 4.429012345679013e-05, "loss": 2.5516, "step": 410 }, { "epoch": 0.35353535353535354, "grad_norm": 2.3480944633483887, "learning_rate": 4.414983164983165e-05, "loss": 2.5533, "step": 420 }, { "epoch": 0.36195286195286197, "grad_norm": 2.076554298400879, 
"learning_rate": 4.4009539842873175e-05, "loss": 2.5942, "step": 430 }, { "epoch": 0.37037037037037035, "grad_norm": 2.461144208908081, "learning_rate": 4.3869248035914707e-05, "loss": 2.7073, "step": 440 }, { "epoch": 0.3787878787878788, "grad_norm": 2.3409440517425537, "learning_rate": 4.372895622895623e-05, "loss": 2.551, "step": 450 }, { "epoch": 0.3872053872053872, "grad_norm": 2.0938796997070312, "learning_rate": 4.358866442199776e-05, "loss": 2.5157, "step": 460 }, { "epoch": 0.3956228956228956, "grad_norm": 2.0075490474700928, "learning_rate": 4.344837261503928e-05, "loss": 2.6791, "step": 470 }, { "epoch": 0.40404040404040403, "grad_norm": 2.0483977794647217, "learning_rate": 4.330808080808081e-05, "loss": 2.5285, "step": 480 }, { "epoch": 0.41245791245791247, "grad_norm": 2.0210118293762207, "learning_rate": 4.316778900112234e-05, "loss": 2.4894, "step": 490 }, { "epoch": 0.4208754208754209, "grad_norm": 2.2782247066497803, "learning_rate": 4.3027497194163864e-05, "loss": 2.541, "step": 500 }, { "epoch": 0.4292929292929293, "grad_norm": 2.6043848991394043, "learning_rate": 4.288720538720539e-05, "loss": 2.5162, "step": 510 }, { "epoch": 0.4377104377104377, "grad_norm": 2.8767223358154297, "learning_rate": 4.2746913580246915e-05, "loss": 2.5854, "step": 520 }, { "epoch": 0.44612794612794615, "grad_norm": 2.8873238563537598, "learning_rate": 4.260662177328844e-05, "loss": 2.6396, "step": 530 }, { "epoch": 0.45454545454545453, "grad_norm": 2.052957057952881, "learning_rate": 4.246632996632997e-05, "loss": 2.4754, "step": 540 }, { "epoch": 0.46296296296296297, "grad_norm": 2.027665376663208, "learning_rate": 4.23260381593715e-05, "loss": 2.4765, "step": 550 }, { "epoch": 0.4713804713804714, "grad_norm": 2.368720293045044, "learning_rate": 4.218574635241302e-05, "loss": 2.4542, "step": 560 }, { "epoch": 0.4797979797979798, "grad_norm": 2.33085298538208, "learning_rate": 4.204545454545455e-05, "loss": 2.4622, "step": 570 }, { "epoch": 0.4882154882154882, 
"grad_norm": 2.237562417984009, "learning_rate": 4.190516273849607e-05, "loss": 2.5542, "step": 580 }, { "epoch": 0.49663299663299665, "grad_norm": 2.0441579818725586, "learning_rate": 4.17648709315376e-05, "loss": 2.4717, "step": 590 }, { "epoch": 0.5050505050505051, "grad_norm": 2.248406171798706, "learning_rate": 4.162457912457913e-05, "loss": 2.4661, "step": 600 }, { "epoch": 0.5134680134680135, "grad_norm": 2.1775243282318115, "learning_rate": 4.1484287317620655e-05, "loss": 2.5614, "step": 610 }, { "epoch": 0.5218855218855218, "grad_norm": 2.319425344467163, "learning_rate": 4.134399551066218e-05, "loss": 2.5012, "step": 620 }, { "epoch": 0.5303030303030303, "grad_norm": 2.232248544692993, "learning_rate": 4.1203703703703705e-05, "loss": 2.5689, "step": 630 }, { "epoch": 0.5387205387205387, "grad_norm": 2.3449838161468506, "learning_rate": 4.106341189674523e-05, "loss": 2.4287, "step": 640 }, { "epoch": 0.5471380471380471, "grad_norm": 2.412785768508911, "learning_rate": 4.092312008978676e-05, "loss": 2.5041, "step": 650 }, { "epoch": 0.5555555555555556, "grad_norm": 2.432955503463745, "learning_rate": 4.078282828282828e-05, "loss": 2.4476, "step": 660 }, { "epoch": 0.563973063973064, "grad_norm": 2.642298698425293, "learning_rate": 4.064253647586981e-05, "loss": 2.421, "step": 670 }, { "epoch": 0.5723905723905723, "grad_norm": 2.232794761657715, "learning_rate": 4.050224466891134e-05, "loss": 2.5912, "step": 680 }, { "epoch": 0.5808080808080808, "grad_norm": 2.8323323726654053, "learning_rate": 4.036195286195286e-05, "loss": 2.3986, "step": 690 }, { "epoch": 0.5892255892255892, "grad_norm": 4.2033891677856445, "learning_rate": 4.0221661054994395e-05, "loss": 2.5413, "step": 700 }, { "epoch": 0.5976430976430976, "grad_norm": 2.3608999252319336, "learning_rate": 4.008136924803591e-05, "loss": 2.5453, "step": 710 }, { "epoch": 0.6060606060606061, "grad_norm": 2.5277252197265625, "learning_rate": 3.9941077441077445e-05, "loss": 2.4956, "step": 720 }, { "epoch": 
0.6144781144781145, "grad_norm": 2.5772793292999268, "learning_rate": 3.980078563411897e-05, "loss": 2.5238, "step": 730 }, { "epoch": 0.622895622895623, "grad_norm": 2.718047618865967, "learning_rate": 3.9660493827160496e-05, "loss": 2.4327, "step": 740 }, { "epoch": 0.6313131313131313, "grad_norm": 2.5964958667755127, "learning_rate": 3.952020202020202e-05, "loss": 2.5208, "step": 750 }, { "epoch": 0.6397306397306397, "grad_norm": 2.4600930213928223, "learning_rate": 3.9379910213243546e-05, "loss": 2.6661, "step": 760 }, { "epoch": 0.6481481481481481, "grad_norm": 2.449575662612915, "learning_rate": 3.923961840628507e-05, "loss": 2.5442, "step": 770 }, { "epoch": 0.6565656565656566, "grad_norm": 2.3219118118286133, "learning_rate": 3.90993265993266e-05, "loss": 2.3805, "step": 780 }, { "epoch": 0.664983164983165, "grad_norm": 2.774872064590454, "learning_rate": 3.895903479236813e-05, "loss": 2.4343, "step": 790 }, { "epoch": 0.6734006734006734, "grad_norm": 2.466688632965088, "learning_rate": 3.881874298540965e-05, "loss": 2.4088, "step": 800 }, { "epoch": 0.6818181818181818, "grad_norm": 3.1128170490264893, "learning_rate": 3.867845117845118e-05, "loss": 2.5768, "step": 810 }, { "epoch": 0.6902356902356902, "grad_norm": 4.245325088500977, "learning_rate": 3.8538159371492704e-05, "loss": 2.3917, "step": 820 }, { "epoch": 0.6986531986531986, "grad_norm": 2.6386542320251465, "learning_rate": 3.8397867564534236e-05, "loss": 2.3603, "step": 830 }, { "epoch": 0.7070707070707071, "grad_norm": 3.0349717140197754, "learning_rate": 3.825757575757576e-05, "loss": 2.5057, "step": 840 }, { "epoch": 0.7154882154882155, "grad_norm": 2.524481773376465, "learning_rate": 3.8117283950617286e-05, "loss": 2.5321, "step": 850 }, { "epoch": 0.7239057239057239, "grad_norm": 3.5061841011047363, "learning_rate": 3.797699214365881e-05, "loss": 2.6322, "step": 860 }, { "epoch": 0.7323232323232324, "grad_norm": 2.5284855365753174, "learning_rate": 3.7836700336700336e-05, "loss": 2.5965, 
"step": 870 }, { "epoch": 0.7407407407407407, "grad_norm": 2.923661231994629, "learning_rate": 3.769640852974187e-05, "loss": 2.4343, "step": 880 }, { "epoch": 0.7491582491582491, "grad_norm": 2.788609743118286, "learning_rate": 3.7556116722783393e-05, "loss": 2.4594, "step": 890 }, { "epoch": 0.7575757575757576, "grad_norm": 2.8142261505126953, "learning_rate": 3.741582491582492e-05, "loss": 2.4316, "step": 900 }, { "epoch": 0.765993265993266, "grad_norm": 2.7626090049743652, "learning_rate": 3.7275533108866444e-05, "loss": 2.454, "step": 910 }, { "epoch": 0.7744107744107744, "grad_norm": 2.700515031814575, "learning_rate": 3.713524130190797e-05, "loss": 2.5807, "step": 920 }, { "epoch": 0.7828282828282829, "grad_norm": 2.9090945720672607, "learning_rate": 3.69949494949495e-05, "loss": 2.461, "step": 930 }, { "epoch": 0.7912457912457912, "grad_norm": 2.439922332763672, "learning_rate": 3.6854657687991026e-05, "loss": 2.454, "step": 940 }, { "epoch": 0.7996632996632996, "grad_norm": 2.9203062057495117, "learning_rate": 3.6714365881032544e-05, "loss": 2.4361, "step": 950 }, { "epoch": 0.8080808080808081, "grad_norm": 2.6928439140319824, "learning_rate": 3.6574074074074076e-05, "loss": 2.4572, "step": 960 }, { "epoch": 0.8164983164983165, "grad_norm": 2.66032338142395, "learning_rate": 3.64337822671156e-05, "loss": 2.4882, "step": 970 }, { "epoch": 0.8249158249158249, "grad_norm": 2.4645493030548096, "learning_rate": 3.6293490460157134e-05, "loss": 2.4432, "step": 980 }, { "epoch": 0.8333333333333334, "grad_norm": 2.7508137226104736, "learning_rate": 3.615319865319866e-05, "loss": 2.3677, "step": 990 }, { "epoch": 0.8417508417508418, "grad_norm": 3.1264898777008057, "learning_rate": 3.601290684624018e-05, "loss": 2.444, "step": 1000 }, { "epoch": 0.8501683501683501, "grad_norm": 2.5735745429992676, "learning_rate": 3.587261503928171e-05, "loss": 2.4514, "step": 1010 }, { "epoch": 0.8585858585858586, "grad_norm": 2.614163637161255, "learning_rate": 
3.5732323232323234e-05, "loss": 2.4012, "step": 1020 }, { "epoch": 0.867003367003367, "grad_norm": 3.058293342590332, "learning_rate": 3.559203142536476e-05, "loss": 2.5346, "step": 1030 }, { "epoch": 0.8754208754208754, "grad_norm": 2.299180507659912, "learning_rate": 3.5451739618406285e-05, "loss": 2.4102, "step": 1040 }, { "epoch": 0.8838383838383839, "grad_norm": 2.3540875911712646, "learning_rate": 3.531144781144781e-05, "loss": 2.3884, "step": 1050 }, { "epoch": 0.8922558922558923, "grad_norm": 2.4985461235046387, "learning_rate": 3.517115600448934e-05, "loss": 2.5071, "step": 1060 }, { "epoch": 0.9006734006734006, "grad_norm": 2.615652322769165, "learning_rate": 3.503086419753087e-05, "loss": 2.4177, "step": 1070 }, { "epoch": 0.9090909090909091, "grad_norm": 2.617262601852417, "learning_rate": 3.489057239057239e-05, "loss": 2.3434, "step": 1080 }, { "epoch": 0.9175084175084175, "grad_norm": 2.422133445739746, "learning_rate": 3.475028058361392e-05, "loss": 2.4621, "step": 1090 }, { "epoch": 0.9259259259259259, "grad_norm": 2.65753173828125, "learning_rate": 3.460998877665544e-05, "loss": 2.398, "step": 1100 }, { "epoch": 0.9343434343434344, "grad_norm": 2.8853819370269775, "learning_rate": 3.4469696969696974e-05, "loss": 2.5412, "step": 1110 }, { "epoch": 0.9427609427609428, "grad_norm": 2.287621021270752, "learning_rate": 3.43294051627385e-05, "loss": 2.3842, "step": 1120 }, { "epoch": 0.9511784511784511, "grad_norm": 2.8708698749542236, "learning_rate": 3.4189113355780025e-05, "loss": 2.365, "step": 1130 }, { "epoch": 0.9595959595959596, "grad_norm": 2.9824109077453613, "learning_rate": 3.404882154882155e-05, "loss": 2.5138, "step": 1140 }, { "epoch": 0.968013468013468, "grad_norm": 3.031399726867676, "learning_rate": 3.3908529741863075e-05, "loss": 2.4326, "step": 1150 }, { "epoch": 0.9764309764309764, "grad_norm": 2.725060224533081, "learning_rate": 3.37682379349046e-05, "loss": 2.3802, "step": 1160 }, { "epoch": 0.9848484848484849, "grad_norm": 
2.43306303024292, "learning_rate": 3.362794612794613e-05, "loss": 2.4778, "step": 1170 }, { "epoch": 0.9932659932659933, "grad_norm": 2.6997814178466797, "learning_rate": 3.348765432098766e-05, "loss": 2.4728, "step": 1180 } ], "logging_steps": 10, "max_steps": 3564, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2549717436727296.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }