{ "best_metric": 1.6930413246154785, "best_model_checkpoint": "./FT_models/[LDH]0221_all_llama31_docs/checkpoint-500", "epoch": 68.63366336633663, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.19801980198019803, "grad_norm": 0.4907771050930023, "learning_rate": 0.0001999995055317446, "loss": 1.9296, "step": 10 }, { "epoch": 0.39603960396039606, "grad_norm": 0.4170154929161072, "learning_rate": 0.0001999955498150411, "loss": 1.5182, "step": 20 }, { "epoch": 0.594059405940594, "grad_norm": 0.6515398025512695, "learning_rate": 0.00019998763853811184, "loss": 1.3992, "step": 30 }, { "epoch": 0.7920792079207921, "grad_norm": 0.6787588596343994, "learning_rate": 0.00019997577201390606, "loss": 1.3514, "step": 40 }, { "epoch": 0.9900990099009901, "grad_norm": 0.6807988286018372, "learning_rate": 0.0001999599507118322, "loss": 1.1463, "step": 50 }, { "epoch": 1.1782178217821782, "grad_norm": 0.7439409494400024, "learning_rate": 0.00019994017525773913, "loss": 0.9557, "step": 60 }, { "epoch": 1.3762376237623761, "grad_norm": 0.8682450652122498, "learning_rate": 0.0001999164464338918, "loss": 0.8612, "step": 70 }, { "epoch": 1.5742574257425743, "grad_norm": 0.9405132532119751, "learning_rate": 0.0001998887651789398, "loss": 0.713, "step": 80 }, { "epoch": 1.7722772277227723, "grad_norm": 1.0701061487197876, "learning_rate": 0.0001998571325878806, "loss": 0.6861, "step": 90 }, { "epoch": 1.9702970297029703, "grad_norm": 0.8982220888137817, "learning_rate": 0.00019982154991201608, "loss": 0.5632, "step": 100 }, { "epoch": 2.1584158415841586, "grad_norm": 0.8719994425773621, "learning_rate": 0.00019978201855890308, "loss": 0.4271, "step": 110 }, { "epoch": 2.3564356435643563, "grad_norm": 0.9812946319580078, "learning_rate": 0.00019973854009229763, "loss": 0.316, "step": 120 }, { "epoch": 2.5544554455445545, "grad_norm": 0.9043093323707581, "learning_rate": 0.00019969111623209323, "loss": 0.2683, "step": 130 }, { "epoch": 2.7524752475247523, "grad_norm": 0.7294723987579346, "learning_rate": 0.00019963974885425266, "loss": 0.2439, "step": 140 }, { "epoch": 2.9504950495049505, "grad_norm": 0.8509527444839478, "learning_rate": 0.00019958443999073397, "loss": 0.2409, "step": 150 }, { "epoch": 3.1386138613861387, "grad_norm": 0.6639506816864014, "learning_rate": 0.00019952519182940993, "loss": 0.1796, "step": 160 }, { "epoch": 3.3366336633663365, "grad_norm": 0.64533531665802, "learning_rate": 0.0001994620067139815, "loss": 0.1456, "step": 170 }, { "epoch": 3.5346534653465347, "grad_norm": 0.7491998672485352, "learning_rate": 0.00019939488714388524, "loss": 0.114, "step": 180 }, { "epoch": 3.7326732673267324, "grad_norm": 0.49455082416534424, "learning_rate": 0.00019932383577419432, "loss": 0.1028, "step": 190 }, { "epoch": 3.9306930693069306, "grad_norm": 0.5427107214927673, "learning_rate": 0.0001992488554155135, "loss": 0.1218, "step": 200 }, { "epoch": 4.118811881188119, "grad_norm": 0.3893876373767853, "learning_rate": 0.0001991699490338681, "loss": 0.0833, "step": 210 }, { "epoch": 4.316831683168317, "grad_norm": 0.4123828113079071, "learning_rate": 0.00019908711975058637, "loss": 0.0684, "step": 220 }, { "epoch": 4.514851485148515, "grad_norm": 0.5183358788490295, "learning_rate": 0.00019900037084217637, "loss": 0.0741, "step": 230 }, { "epoch": 4.712871287128713, "grad_norm": 0.3828432261943817, "learning_rate": 0.00019890970574019617, "loss": 0.0607, "step": 240 }, { "epoch": 4.910891089108911, "grad_norm": 0.30472326278686523, "learning_rate": 0.00019881512803111796, "loss": 0.0607, "step": 250 }, { "epoch": 5.099009900990099, "grad_norm": 0.3620370328426361, "learning_rate": 0.00019871664145618657, "loss": 0.0545, "step": 260 }, { "epoch": 5.297029702970297, "grad_norm": 0.3857772946357727, "learning_rate": 0.00019861424991127115, "loss": 0.0394, "step": 270 }, { "epoch": 5.4950495049504955, "grad_norm": 0.22827056050300598, "learning_rate": 0.00019850795744671116, "loss": 0.0404, "step": 280 }, { "epoch": 5.693069306930693, "grad_norm": 0.38091033697128296, "learning_rate": 0.00019839776826715614, "loss": 0.0406, "step": 290 }, { "epoch": 5.891089108910891, "grad_norm": 0.4117518663406372, "learning_rate": 0.00019828368673139947, "loss": 0.0414, "step": 300 }, { "epoch": 6.079207920792079, "grad_norm": 0.32705605030059814, "learning_rate": 0.00019816571735220583, "loss": 0.038, "step": 310 }, { "epoch": 6.2772277227722775, "grad_norm": 0.3450210988521576, "learning_rate": 0.0001980438647961327, "loss": 0.0305, "step": 320 }, { "epoch": 6.475247524752476, "grad_norm": 0.29956933856010437, "learning_rate": 0.00019791813388334581, "loss": 0.0301, "step": 330 }, { "epoch": 6.673267326732673, "grad_norm": 0.39558252692222595, "learning_rate": 0.00019778852958742853, "loss": 0.0312, "step": 340 }, { "epoch": 6.871287128712871, "grad_norm": 0.29619818925857544, "learning_rate": 0.00019765505703518496, "loss": 0.0364, "step": 350 }, { "epoch": 7.0594059405940595, "grad_norm": 0.2381938099861145, "learning_rate": 0.00019751772150643722, "loss": 0.0307, "step": 360 }, { "epoch": 7.257425742574258, "grad_norm": 0.38348841667175293, "learning_rate": 0.0001973765284338167, "loss": 0.0269, "step": 370 }, { "epoch": 7.455445544554456, "grad_norm": 0.25046518445014954, "learning_rate": 0.00019723148340254892, "loss": 0.0233, "step": 380 }, { "epoch": 7.653465346534653, "grad_norm": 0.27623361349105835, "learning_rate": 0.0001970825921502328, "loss": 0.0285, "step": 390 }, { "epoch": 7.851485148514851, "grad_norm": 0.2806844413280487, "learning_rate": 0.00019692986056661356, "loss": 0.0284, "step": 400 }, { "epoch": 8.03960396039604, "grad_norm": 0.31793248653411865, "learning_rate": 0.0001967732946933499, "loss": 0.0298, "step": 410 }, { "epoch": 8.237623762376238, "grad_norm": 0.26446977257728577, "learning_rate": 0.00019661290072377482, "loss": 0.0242, "step": 420 }, { "epoch": 8.435643564356436, "grad_norm": 0.21607118844985962, "learning_rate": 0.0001964486850026507, "loss": 0.0212, "step": 430 }, { "epoch": 8.633663366336634, "grad_norm": 0.3113400340080261, "learning_rate": 0.00019628065402591845, "loss": 0.0223, "step": 440 }, { "epoch": 8.831683168316832, "grad_norm": 0.21292166411876678, "learning_rate": 0.0001961088144404403, "loss": 0.0251, "step": 450 }, { "epoch": 9.01980198019802, "grad_norm": 0.2799708843231201, "learning_rate": 0.00019593317304373705, "loss": 0.0222, "step": 460 }, { "epoch": 9.217821782178218, "grad_norm": 0.3010728061199188, "learning_rate": 0.00019575373678371909, "loss": 0.0192, "step": 470 }, { "epoch": 9.415841584158416, "grad_norm": 0.26538291573524475, "learning_rate": 0.0001955705127584117, "loss": 0.0208, "step": 480 }, { "epoch": 9.613861386138614, "grad_norm": 0.29595261812210083, "learning_rate": 0.00019538350821567404, "loss": 0.0211, "step": 490 }, { "epoch": 9.811881188118813, "grad_norm": 0.28233084082603455, "learning_rate": 0.00019519273055291266, "loss": 0.0248, "step": 500 }, { "epoch": 9.811881188118813, "eval_loss": 1.6930413246154785, "eval_runtime": 78.4865, "eval_samples_per_second": 4.039, "eval_steps_per_second": 2.026, "step": 500 }, { "epoch": 10.0, "grad_norm": 0.392619788646698, "learning_rate": 0.00019499818731678873, "loss": 0.0263, "step": 510 }, { "epoch": 10.198019801980198, "grad_norm": 0.18203061819076538, "learning_rate": 0.00019479988620291956, "loss": 0.0197, "step": 520 }, { "epoch": 10.396039603960396, "grad_norm": 0.4150699973106384, "learning_rate": 0.00019459783505557424, "loss": 0.0244, "step": 530 }, { "epoch": 10.594059405940595, "grad_norm": 0.1509980857372284, "learning_rate": 0.0001943920418673633, "loss": 0.021, "step": 540 }, { "epoch": 10.792079207920793, "grad_norm": 0.4003594219684601, "learning_rate": 0.0001941825147789225, "loss": 0.0224, "step": 550 }, { "epoch": 10.990099009900991, "grad_norm": 0.18431659042835236, "learning_rate": 0.00019396926207859084, "loss": 0.0221, "step": 560 }, { "epoch": 11.178217821782178, "grad_norm": 0.15787458419799805, "learning_rate": 0.00019375229220208276, "loss": 0.0193, "step": 570 }, { "epoch": 11.376237623762377, "grad_norm": 0.3416723608970642, "learning_rate": 0.0001935316137321543, "loss": 0.0209, "step": 580 }, { "epoch": 11.574257425742575, "grad_norm": 0.486870139837265, "learning_rate": 0.00019330723539826375, "loss": 0.021, "step": 590 }, { "epoch": 11.772277227722773, "grad_norm": 0.25933241844177246, "learning_rate": 0.0001930791660762262, "loss": 0.0198, "step": 600 }, { "epoch": 11.97029702970297, "grad_norm": 0.10881418734788895, "learning_rate": 0.0001928474147878626, "loss": 0.02, "step": 610 }, { "epoch": 12.158415841584159, "grad_norm": 0.19103562831878662, "learning_rate": 0.0001926119907006426, "loss": 0.0178, "step": 620 }, { "epoch": 12.356435643564357, "grad_norm": 0.1987970769405365, "learning_rate": 0.00019237290312732226, "loss": 0.0188, "step": 630 }, { "epoch": 12.554455445544555, "grad_norm": 0.45853590965270996, "learning_rate": 0.0001921301615255754, "loss": 0.0182, "step": 640 }, { "epoch": 12.752475247524753, "grad_norm": 0.22175024449825287, "learning_rate": 0.00019188377549761963, "loss": 0.02, "step": 650 }, { "epoch": 12.950495049504951, "grad_norm": 0.45677581429481506, "learning_rate": 0.00019163375478983632, "loss": 0.0231, "step": 660 }, { "epoch": 13.138613861386139, "grad_norm": 0.3287411630153656, "learning_rate": 0.00019138010929238534, "loss": 0.0215, "step": 670 }, { "epoch": 13.336633663366337, "grad_norm": 0.43493467569351196, "learning_rate": 0.0001911228490388136, "loss": 0.021, "step": 680 }, { "epoch": 13.534653465346535, "grad_norm": 0.15248054265975952, "learning_rate": 0.00019086198420565823, "loss": 0.0201, "step": 690 }, { "epoch": 13.732673267326733, "grad_norm": 0.17455808818340302, "learning_rate": 0.000190597525112044, "loss": 0.0225, "step": 700 }, { "epoch": 13.930693069306932, "grad_norm": 0.2868499457836151, "learning_rate": 0.00019032948221927524, "loss": 0.0195, "step": 710 }, { "epoch": 14.118811881188119, "grad_norm": 0.20362168550491333, "learning_rate": 0.00019005786613042185, "loss": 0.0211, "step": 720 }, { "epoch": 14.316831683168317, "grad_norm": 0.17784035205841064, "learning_rate": 0.00018978268758989991, "loss": 0.0217, "step": 730 }, { "epoch": 14.514851485148515, "grad_norm": 0.1722538322210312, "learning_rate": 0.00018950395748304678, "loss": 0.0178, "step": 740 }, { "epoch": 14.712871287128714, "grad_norm": 0.3050313889980316, "learning_rate": 0.0001892216868356904, "loss": 0.0252, "step": 750 }, { "epoch": 14.910891089108912, "grad_norm": 0.20032328367233276, "learning_rate": 0.00018893588681371303, "loss": 0.0191, "step": 760 }, { "epoch": 15.099009900990099, "grad_norm": 0.2856597304344177, "learning_rate": 0.00018864656872260985, "loss": 0.0196, "step": 770 }, { "epoch": 15.297029702970297, "grad_norm": 0.1927269697189331, "learning_rate": 0.00018835374400704154, "loss": 0.0177, "step": 780 }, { "epoch": 15.495049504950495, "grad_norm": 0.10624806582927704, "learning_rate": 0.00018805742425038145, "loss": 0.0187, "step": 790 }, { "epoch": 15.693069306930694, "grad_norm": 0.17643067240715027, "learning_rate": 0.00018775762117425777, "loss": 0.018, "step": 800 }, { "epoch": 15.891089108910892, "grad_norm": 0.25009414553642273, "learning_rate": 0.00018745434663808942, "loss": 0.0196, "step": 810 }, { "epoch": 16.07920792079208, "grad_norm": 0.13281401991844177, "learning_rate": 0.00018714761263861728, "loss": 0.0153, "step": 820 }, { "epoch": 16.277227722772277, "grad_norm": 0.20706744492053986, "learning_rate": 0.00018683743130942928, "loss": 0.0153, "step": 830 }, { "epoch": 16.475247524752476, "grad_norm": 0.26562607288360596, "learning_rate": 0.00018652381492048083, "loss": 0.0161, "step": 840 }, { "epoch": 16.673267326732674, "grad_norm": 0.24612267315387726, "learning_rate": 0.00018620677587760916, "loss": 0.017, "step": 850 }, { "epoch": 16.871287128712872, "grad_norm": 0.12470393627882004, "learning_rate": 0.00018588632672204264, "loss": 0.0163, "step": 860 }, { "epoch": 17.059405940594058, "grad_norm": 0.16730207204818726, "learning_rate": 0.00018556248012990468, "loss": 0.0164, "step": 870 }, { "epoch": 17.257425742574256, "grad_norm": 0.22669559717178345, "learning_rate": 0.0001852352489117124, "loss": 0.0185, "step": 880 }, { "epoch": 17.455445544554454, "grad_norm": 0.18988390266895294, "learning_rate": 0.0001849046460118698, "loss": 0.0168, "step": 890 }, { "epoch": 17.653465346534652, "grad_norm": 0.1657964289188385, "learning_rate": 0.00018457068450815562, "loss": 0.0169, "step": 900 }, { "epoch": 17.85148514851485, "grad_norm": 0.12119423598051071, "learning_rate": 0.00018423337761120618, "loss": 0.0173, "step": 910 }, { "epoch": 18.03960396039604, "grad_norm": 0.10356303304433823, "learning_rate": 0.00018389273866399275, "loss": 0.0159, "step": 920 }, { "epoch": 18.237623762376238, "grad_norm": 0.09106975793838501, "learning_rate": 0.00018354878114129367, "loss": 0.0145, "step": 930 }, { "epoch": 18.435643564356436, "grad_norm": 0.11265785992145538, "learning_rate": 0.00018320151864916135, "loss": 0.0142, "step": 940 }, { "epoch": 18.633663366336634, "grad_norm": 0.1934558004140854, "learning_rate": 0.00018285096492438424, "loss": 0.0146, "step": 950 }, { "epoch": 18.831683168316832, "grad_norm": 0.09819146245718002, "learning_rate": 0.00018249713383394303, "loss": 0.0163, "step": 960 }, { "epoch": 19.019801980198018, "grad_norm": 0.17335668206214905, "learning_rate": 0.00018214003937446253, "loss": 0.0165, "step": 970 }, { "epoch": 19.217821782178216, "grad_norm": 0.10033983737230301, "learning_rate": 0.0001817796956716578, "loss": 0.0141, "step": 980 }, { "epoch": 19.415841584158414, "grad_norm": 0.10356291383504868, "learning_rate": 0.00018141611697977529, "loss": 0.0138, "step": 990 }, { "epoch": 19.613861386138613, "grad_norm": 0.16795016825199127, "learning_rate": 0.0001810493176810292, "loss": 0.0148, "step": 1000 }, { "epoch": 19.613861386138613, "eval_loss": 1.8084672689437866, "eval_runtime": 82.2773, "eval_samples_per_second": 3.853, "eval_steps_per_second": 1.932, "step": 1000 }, { "epoch": 19.81188118811881, "grad_norm": 0.23259301483631134, "learning_rate": 0.00018067931228503246, "loss": 0.016, "step": 1010 }, { "epoch": 20.0, "grad_norm": 0.07684651017189026, "learning_rate": 0.00018030611542822257, "loss": 0.0148, "step": 1020 }, { "epoch": 20.198019801980198, "grad_norm": 0.16659528017044067, "learning_rate": 0.00017992974187328305, "loss": 0.0127, "step": 1030 }, { "epoch": 20.396039603960396, "grad_norm": 0.25504982471466064, "learning_rate": 0.000179550206508559, "loss": 0.0125, "step": 1040 }, { "epoch": 20.594059405940595, "grad_norm": 0.13706262409687042, "learning_rate": 0.00017916752434746856, "loss": 0.0139, "step": 1050 }, { "epoch": 20.792079207920793, "grad_norm": 0.09546064585447311, "learning_rate": 0.00017878171052790868, "loss": 0.0143, "step": 1060 }, { "epoch": 20.99009900990099, "grad_norm": 0.19178028404712677, "learning_rate": 0.00017839278031165658, "loss": 0.0146, "step": 1070 }, { "epoch": 21.178217821782177, "grad_norm": 0.09425017982721329, "learning_rate": 0.00017800074908376584, "loss": 0.0137, "step": 1080 }, { "epoch": 21.376237623762375, "grad_norm": 0.0950135812163353, "learning_rate": 0.0001776056323519579, "loss": 0.0142, "step": 1090 }, { "epoch": 21.574257425742573, "grad_norm": 0.11520115286111832, "learning_rate": 0.00017720744574600863, "loss": 0.0147, "step": 1100 }, { "epoch": 21.77227722772277, "grad_norm": 0.07679902017116547, "learning_rate": 0.00017680620501712996, "loss": 0.013, "step": 1110 }, { "epoch": 21.97029702970297, "grad_norm": 0.0910814180970192, "learning_rate": 0.00017640192603734692, "loss": 0.0139, "step": 1120 }, { "epoch": 22.15841584158416, "grad_norm": 0.18983718752861023, "learning_rate": 0.00017599462479886974, "loss": 0.0119, "step": 1130 }, { "epoch": 22.356435643564357, "grad_norm": 0.05227164551615715, "learning_rate": 0.00017558431741346122, "loss": 0.0121, "step": 1140 }, { "epoch": 22.554455445544555, "grad_norm": 0.10026168078184128, "learning_rate": 0.00017517102011179933, "loss": 0.0128, "step": 1150 }, { "epoch": 22.752475247524753, "grad_norm": 0.10607390105724335, "learning_rate": 0.00017475474924283536, "loss": 0.0126, "step": 1160 }, { "epoch": 22.95049504950495, "grad_norm": 0.16404001414775848, "learning_rate": 0.000174335521273147, "loss": 0.013, "step": 1170 }, { "epoch": 23.138613861386137, "grad_norm": 0.09486360847949982, "learning_rate": 0.00017391335278628712, "loss": 0.0122, "step": 1180 }, { "epoch": 23.336633663366335, "grad_norm": 0.05201767012476921, "learning_rate": 0.0001734882604821276, "loss": 0.0125, "step": 1190 }, { "epoch": 23.534653465346533, "grad_norm": 0.34572598338127136, "learning_rate": 0.00017306026117619889, "loss": 0.0168, "step": 1200 }, { "epoch": 23.73267326732673, "grad_norm": 0.22629716992378235, "learning_rate": 0.00017262937179902472, "loss": 0.0143, "step": 1210 }, { "epoch": 23.93069306930693, "grad_norm": 0.2683790922164917, "learning_rate": 0.00017219560939545246, "loss": 0.0159, "step": 1220 }, { "epoch": 24.11881188118812, "grad_norm": 0.17118693888187408, "learning_rate": 0.0001717589911239788, "loss": 0.016, "step": 1230 }, { "epoch": 24.316831683168317, "grad_norm": 0.105218805372715, "learning_rate": 0.00017131953425607104, "loss": 0.016, "step": 1240 }, { "epoch": 24.514851485148515, "grad_norm": 0.20184026658535004, "learning_rate": 0.00017087725617548385, "loss": 0.0183, "step": 1250 }, { "epoch": 24.712871287128714, "grad_norm": 0.2845208942890167, "learning_rate": 0.00017043217437757164, "loss": 0.0189, "step": 1260 }, { "epoch": 24.91089108910891, "grad_norm": 0.2577328383922577, "learning_rate": 0.00016998430646859654, "loss": 0.0196, "step": 1270 }, { "epoch": 25.099009900990097, "grad_norm": 0.22288930416107178, "learning_rate": 0.00016953367016503182, "loss": 0.0177, "step": 1280 }, { "epoch": 25.297029702970296, "grad_norm": 0.36929747462272644, "learning_rate": 0.00016908028329286112, "loss": 0.0202, "step": 1290 }, { "epoch": 25.495049504950494, "grad_norm": 0.2899525463581085, "learning_rate": 0.0001686241637868734, "loss": 0.0213, "step": 1300 }, { "epoch": 25.693069306930692, "grad_norm": 0.32868900895118713, "learning_rate": 0.00016816532968995328, "loss": 0.0182, "step": 1310 }, { "epoch": 25.89108910891089, "grad_norm": 0.3192131817340851, "learning_rate": 0.00016770379915236766, "loss": 0.0218, "step": 1320 }, { "epoch": 26.07920792079208, "grad_norm": 0.26361629366874695, "learning_rate": 0.00016723959043104728, "loss": 0.0179, "step": 1330 }, { "epoch": 26.277227722772277, "grad_norm": 0.3538498282432556, "learning_rate": 0.00016677272188886483, "loss": 0.0168, "step": 1340 }, { "epoch": 26.475247524752476, "grad_norm": 0.19713373482227325, "learning_rate": 0.00016630321199390867, "loss": 0.0173, "step": 1350 }, { "epoch": 26.673267326732674, "grad_norm": 0.29515835642814636, "learning_rate": 0.00016583107931875192, "loss": 0.0177, "step": 1360 }, { "epoch": 26.871287128712872, "grad_norm": 0.09990435093641281, "learning_rate": 0.00016535634253971794, "loss": 0.0167, "step": 1370 }, { "epoch": 27.059405940594058, "grad_norm": 0.308371365070343, "learning_rate": 0.00016487902043614173, "loss": 0.0165, "step": 1380 }, { "epoch": 27.257425742574256, "grad_norm": 0.2830803394317627, "learning_rate": 0.00016439913188962685, "loss": 0.0155, "step": 1390 }, { "epoch": 27.455445544554454, "grad_norm": 0.21548472344875336, "learning_rate": 0.0001639166958832985, "loss": 0.0163, "step": 1400 }, { "epoch": 27.653465346534652, "grad_norm": 0.1809612214565277, "learning_rate": 0.00016343173150105278, "loss": 0.0146, "step": 1410 }, { "epoch": 27.85148514851485, "grad_norm": 0.2266589254140854, "learning_rate": 0.0001629442579268016, "loss": 0.0144, "step": 1420 }, { "epoch": 28.03960396039604, "grad_norm": 0.06419487297534943, "learning_rate": 0.0001624542944437139, "loss": 0.0139, "step": 1430 }, { "epoch": 28.237623762376238, "grad_norm": 0.15811343491077423, "learning_rate": 0.00016196186043345288, "loss": 0.0136, "step": 1440 }, { "epoch": 28.435643564356436, "grad_norm": 0.15876223146915436, "learning_rate": 0.00016146697537540924, "loss": 0.0132, "step": 1450 }, { "epoch": 28.633663366336634, "grad_norm": 0.14463020861148834, "learning_rate": 0.0001609696588459307, "loss": 0.0129, "step": 1460 }, { "epoch": 28.831683168316832, "grad_norm": 0.2847503423690796, "learning_rate": 0.00016046993051754756, "loss": 0.0139, "step": 1470 }, { "epoch": 29.019801980198018, "grad_norm": 0.26266077160835266, "learning_rate": 0.0001599678101581945, "loss": 0.015, "step": 1480 }, { "epoch": 29.217821782178216, "grad_norm": 0.11117064952850342, "learning_rate": 0.00015946331763042867, "loss": 0.012, "step": 1490 }, { "epoch": 29.415841584158414, "grad_norm": 0.041199155151844025, "learning_rate": 0.00015895647289064396, "loss": 0.0113, "step": 1500 }, { "epoch": 29.415841584158414, "eval_loss": 1.8528623580932617, "eval_runtime": 75.9027, "eval_samples_per_second": 4.176, "eval_steps_per_second": 2.095, "step": 1500 }, { "epoch": 29.613861386138613, "grad_norm": 0.4097837507724762, "learning_rate": 0.0001584472959882815, "loss": 0.014, "step": 1510 }, { "epoch": 29.81188118811881, "grad_norm": 0.10821210592985153, "learning_rate": 0.0001579358070650367, "loss": 0.0129, "step": 1520 }, { "epoch": 30.0, "grad_norm": 0.11605525761842728, "learning_rate": 0.00015742202635406235, "loss": 0.0129, "step": 1530 }, { "epoch": 30.198019801980198, "grad_norm": 0.21412257850170135, "learning_rate": 0.0001569059741791684, "loss": 0.011, "step": 1540 }, { "epoch": 30.396039603960396, "grad_norm": 0.0895719975233078, "learning_rate": 0.0001563876709540178, "loss": 0.0129, "step": 1550 }, { "epoch": 30.594059405940595, "grad_norm": 0.18187634646892548, "learning_rate": 0.00015586713718131922, "loss": 0.0127, "step": 1560 }, { "epoch": 30.792079207920793, "grad_norm": 0.10705845803022385, "learning_rate": 0.0001553443934520159, "loss": 0.014, "step": 1570 }, { "epoch": 30.99009900990099, "grad_norm": 0.1223723515868187, "learning_rate": 0.00015481946044447099, "loss": 0.0136, "step": 1580 }, { "epoch": 31.178217821782177, "grad_norm": 0.046719685196876526, "learning_rate": 0.00015429235892364994, "loss": 0.0122, "step": 1590 }, { "epoch": 31.376237623762375, "grad_norm": 0.06917575001716614, "learning_rate": 0.00015376310974029873, "loss": 0.0112, "step": 1600 }, { "epoch": 31.574257425742573, "grad_norm": 0.045560333877801895, "learning_rate": 0.0001532317338301192, "loss": 0.0113, "step": 1610 }, { "epoch": 31.77227722772277, "grad_norm": 0.06091497838497162, "learning_rate": 0.00015269825221294098, "loss": 0.0117, "step": 1620 }, { "epoch": 31.97029702970297, "grad_norm": 0.07725334167480469, "learning_rate": 0.0001521626859918898, "loss": 0.0124, "step": 1630 }, { "epoch": 32.15841584158416, "grad_norm": 0.0351332388818264, "learning_rate": 0.00015162505635255287, "loss": 0.0106, "step": 1640 }, { "epoch": 32.35643564356435, "grad_norm": 0.0352192148566246, "learning_rate": 0.0001510853845621409, "loss": 0.0104, "step": 1650 }, { "epoch": 32.554455445544555, "grad_norm": 0.03620712086558342, "learning_rate": 0.00015054369196864644, "loss": 0.011, "step": 1660 }, { "epoch": 32.75247524752475, "grad_norm": 0.06429705768823624, "learning_rate": 0.00015000000000000001, "loss": 0.0116, "step": 1670 }, { "epoch": 32.95049504950495, "grad_norm": 0.03938416391611099, "learning_rate": 0.0001494543301632219, "loss": 0.0116, "step": 1680 }, { "epoch": 33.13861386138614, "grad_norm": 0.03617365285754204, "learning_rate": 0.0001489067040435717, "loss": 0.0109, "step": 1690 }, { "epoch": 33.336633663366335, "grad_norm": 0.09898613393306732, "learning_rate": 0.00014835714330369446, "loss": 0.0108, "step": 1700 }, { "epoch": 33.53465346534654, "grad_norm": 0.07661855220794678, "learning_rate": 0.0001478056696827636, "loss": 0.0115, "step": 1710 }, { "epoch": 33.73267326732673, "grad_norm": 0.0875670462846756, "learning_rate": 0.00014725230499562119, "loss": 0.0112, "step": 1720 }, { "epoch": 33.93069306930693, "grad_norm": 0.045731861144304276, "learning_rate": 0.00014669707113191483, "loss": 0.012, "step": 1730 }, { "epoch": 34.118811881188115, "grad_norm": 0.03665975108742714, "learning_rate": 0.00014613999005523174, "loss": 0.0108, "step": 1740 }, { "epoch": 34.31683168316832, "grad_norm": 0.07179899513721466, "learning_rate": 0.00014558108380223012, "loss": 0.0106, "step": 1750 }, { "epoch": 34.51485148514851, "grad_norm": 0.04096843674778938, "learning_rate": 0.00014502037448176734, "loss": 0.0112, "step": 1760 }, { "epoch": 34.71287128712871, "grad_norm": 0.03616851940751076, "learning_rate": 0.00014445788427402528, "loss": 0.0112, "step": 1770 }, { "epoch": 34.91089108910891, "grad_norm": 0.2000083476305008, "learning_rate": 0.00014389363542963306, "loss": 0.0119, "step": 1780 }, { "epoch": 35.0990099009901, "grad_norm": 0.03873177245259285, "learning_rate": 0.00014332765026878687, "loss": 0.0108, "step": 1790 }, { "epoch": 35.2970297029703, "grad_norm": 0.030958373099565506, "learning_rate": 0.00014275995118036693, "loss": 0.0099, "step": 1800 }, { "epoch": 35.495049504950494, "grad_norm": 0.03003741428256035, "learning_rate": 0.00014219056062105193, "loss": 0.01, "step": 1810 }, { "epoch": 35.693069306930695, "grad_norm": 0.03531872481107712, "learning_rate": 0.00014161950111443077, "loss": 0.0111, "step": 1820 }, { "epoch": 35.89108910891089, "grad_norm": 0.041126642376184464, "learning_rate": 0.0001410467952501114, "loss": 0.0115, "step": 1830 }, { "epoch": 36.07920792079208, "grad_norm": 0.028950873762369156, "learning_rate": 0.00014047246568282736, "loss": 0.0106, "step": 1840 }, { "epoch": 36.277227722772274, "grad_norm": 0.07969832420349121, "learning_rate": 0.00013989653513154165, "loss": 0.01, "step": 1850 }, { "epoch": 36.475247524752476, "grad_norm": 0.03239164501428604, "learning_rate": 0.0001393190263785479, "loss": 0.0104, "step": 1860 }, { "epoch": 36.67326732673267, "grad_norm": 0.088913194835186, "learning_rate": 0.00013873996226856933, "loss": 0.0112, "step": 1870 }, { "epoch": 36.87128712871287, "grad_norm": 0.34416264295578003, "learning_rate": 0.00013815936570785487, "loss": 0.0127, "step": 1880 }, { "epoch": 37.05940594059406, "grad_norm": 0.14783936738967896, "learning_rate": 0.00013757725966327322, "loss": 0.0119, "step": 1890 }, { "epoch": 37.257425742574256, "grad_norm": 0.09111394733190536, "learning_rate": 0.00013699366716140435, "loss": 0.0116, "step": 1900 }, { "epoch": 37.45544554455446, "grad_norm": 0.10149216651916504, "learning_rate": 0.0001364086112876284, "loss": 0.0113, "step": 1910 }, { "epoch": 37.65346534653465, "grad_norm": 0.058421943336725235, "learning_rate": 0.00013582211518521273, "loss": 0.0113, "step": 1920 }, { "epoch": 37.851485148514854, "grad_norm": 0.11640463024377823, "learning_rate": 0.00013523420205439646, "loss": 0.012, "step": 1930 }, { "epoch": 38.039603960396036, "grad_norm": 0.07603046298027039, "learning_rate": 0.00013464489515147238, "loss": 0.0114, "step": 1940 }, { "epoch": 38.23762376237624, "grad_norm": 0.03392176330089569, "learning_rate": 0.00013405421778786737, "loss": 0.0108, "step": 1950 }, { "epoch": 38.43564356435643, "grad_norm": 0.2335938662290573, "learning_rate": 0.00013346219332922016, "loss": 0.0108, "step": 1960 }, { "epoch": 38.633663366336634, "grad_norm": 0.07149430364370346, "learning_rate": 0.0001328688451944569, "loss": 0.0113, "step": 1970 }, { "epoch": 38.83168316831683, "grad_norm": 0.15134957432746887, "learning_rate": 0.00013227419685486492, "loss": 0.0121, "step": 1980 }, { "epoch": 39.01980198019802, "grad_norm": 0.031247029080986977, "learning_rate": 0.0001316782718331643, "loss": 0.011, "step": 1990 }, { "epoch": 39.21782178217822, "grad_norm": 0.03347298502922058, "learning_rate": 0.00013108109370257712, "loss": 0.011, "step": 2000 }, { "epoch": 39.21782178217822, "eval_loss": 1.9810045957565308, "eval_runtime": 76.3823, "eval_samples_per_second": 4.15, "eval_steps_per_second": 2.082, "step": 2000 }, { "epoch": 39.415841584158414, "grad_norm": 0.09714417904615402, "learning_rate": 0.00013048268608589533, "loss": 0.0116, "step": 2010 }, { "epoch": 39.613861386138616, "grad_norm": 0.05139007791876793, "learning_rate": 0.00012988307265454597, "loss": 0.0111, "step": 2020 }, { "epoch": 39.81188118811881, "grad_norm": 0.04015783220529556, "learning_rate": 0.00012928227712765504, "loss": 0.0117, "step": 2030 }, { "epoch": 40.0, "grad_norm": 0.20379693806171417, "learning_rate": 0.00012868032327110904, "loss": 0.0128, "step": 2040 }, { "epoch": 40.198019801980195, "grad_norm": 0.1887517124414444, "learning_rate": 0.00012807723489661495, "loss": 0.0119, "step": 2050 }, { "epoch": 40.396039603960396, "grad_norm": 0.0888703465461731, "learning_rate": 0.0001274730358607583, "loss": 0.0136, "step": 2060 }, { "epoch": 40.59405940594059, "grad_norm": 0.16852861642837524, "learning_rate": 0.00012686775006405946, "loss": 0.0124, "step": 2070 }, { "epoch": 40.79207920792079, "grad_norm": 0.0944506973028183, "learning_rate": 0.0001262614014500282, "loss": 0.0134, "step": 2080 }, { "epoch": 40.99009900990099, "grad_norm": 0.09365811944007874, "learning_rate": 0.00012565401400421651, "loss": 0.0137, "step": 2090 }, { "epoch": 41.17821782178218, "grad_norm": 0.15938587486743927, "learning_rate": 0.00012504561175326985, "loss": 0.0124, "step": 2100 }, { "epoch": 41.37623762376238, "grad_norm": 0.19702400267124176, "learning_rate": 0.0001244362187639767, "loss": 0.0135, "step": 2110 }, { "epoch": 41.57425742574257, "grad_norm": 0.21812361478805542, "learning_rate": 0.0001238258591423165, "loss": 0.0132, "step": 2120 }, { "epoch": 41.772277227722775, "grad_norm": 0.1647273451089859, "learning_rate": 0.00012321455703250616, "loss": 0.0121, "step": 2130 }, { "epoch": 41.97029702970297, "grad_norm": 0.047169461846351624, "learning_rate": 0.0001226023366160449, "loss": 0.0123, "step": 2140 }, { "epoch": 42.15841584158416, "grad_norm": 0.10308068245649338, "learning_rate": 0.00012198922211075778, "loss": 0.0113, "step": 2150 }, { "epoch": 42.35643564356435, "grad_norm": 0.0610012523829937, "learning_rate": 0.00012137523776983757, "loss": 0.0131, "step": 2160 }, { "epoch": 42.554455445544555, "grad_norm": 0.12856794893741608, "learning_rate": 0.00012076040788088554, "loss": 0.0116, "step": 2170 }, { "epoch": 42.75247524752475, "grad_norm": 0.09634555876255035, "learning_rate": 0.00012014475676495052, "loss": 0.0127, "step": 2180 }, { "epoch": 42.95049504950495, "grad_norm": 0.21406149864196777, "learning_rate": 0.000119528308775567, "loss": 0.0124, "step": 2190 }, { "epoch": 43.13861386138614, "grad_norm": 0.03979460150003433, "learning_rate": 0.00011891108829779165, "loss": 0.0116, "step": 2200 }, { "epoch": 43.336633663366335, "grad_norm": 0.05667963996529579, "learning_rate": 0.00011829311974723867, "loss": 0.0112, "step": 2210 }, { "epoch": 43.53465346534654, "grad_norm": 0.07588782906532288, "learning_rate": 0.00011767442756911417, "loss": 0.0114, "step": 2220 }, { "epoch": 43.73267326732673, "grad_norm": 0.06999919563531876, "learning_rate": 0.00011705503623724898, "loss": 0.0118, "step": 2230 }, { "epoch": 43.93069306930693, "grad_norm": 0.08128093928098679, "learning_rate": 0.00011643497025313061, "loss": 0.0118, "step": 2240 }, { "epoch": 44.118811881188115, "grad_norm": 0.03330959379673004, "learning_rate": 0.0001158142541449341, "loss": 0.0109, "step": 2250 }, { "epoch": 44.31683168316832, "grad_norm": 0.060158368200063705, "learning_rate": 0.0001151929124665516, "loss": 0.0109, "step": 2260 }, { "epoch": 44.51485148514851, "grad_norm": 0.1863548904657364, "learning_rate": 0.00011457096979662114, "loss": 0.011, "step": 2270 }, { "epoch": 44.71287128712871, "grad_norm": 0.03855551779270172, "learning_rate": 0.00011394845073755455, "loss": 0.0113, "step": 2280 }, { "epoch": 44.91089108910891, "grad_norm": 0.06027248874306679, "learning_rate": 0.00011332537991456398, "loss": 0.0111, "step": 2290 }, { "epoch": 45.0990099009901, "grad_norm": 0.1333802491426468, "learning_rate": 0.00011270178197468789, "loss": 0.0112, "step": 2300 }, { "epoch": 45.2970297029703, "grad_norm": 0.07712238281965256, "learning_rate": 0.00011207768158581613, "loss": 0.0099, "step": 2310 }, { "epoch": 45.495049504950494, "grad_norm": 0.03724438324570656, "learning_rate": 0.00011145310343571411, "loss": 0.011, "step": 2320 }, { "epoch": 45.693069306930695, "grad_norm": 0.0400845967233181, "learning_rate": 0.0001108280722310462, "loss": 0.011, "step": 2330 }, { "epoch": 45.89108910891089, "grad_norm": 0.041330672800540924, "learning_rate": 0.00011020261269639842, "loss": 0.0108, "step": 2340 }, { "epoch": 46.07920792079208, "grad_norm": 0.03175568953156471, "learning_rate": 0.00010957674957330042, "loss": 0.0102, "step": 2350 }, { "epoch": 46.277227722772274, "grad_norm": 0.027505742385983467, "learning_rate": 0.00010895050761924668, "loss": 0.0098, "step": 2360 }, { "epoch": 46.475247524752476, "grad_norm": 0.031385671347379684, "learning_rate": 0.00010832391160671729, "loss": 0.0103, "step": 2370 }, { "epoch": 46.67326732673267, "grad_norm": 0.03100833296775818, "learning_rate": 0.00010769698632219794, "loss": 0.0101, "step": 2380 }, { "epoch": 46.87128712871287, "grad_norm": 0.04194655641913414, "learning_rate": 0.00010706975656519946, "loss": 0.0105, "step": 2390 }, { "epoch": 47.05940594059406, "grad_norm": 0.17276820540428162, "learning_rate": 0.00010644224714727681, "loss": 0.0105, "step": 2400 }, { "epoch": 47.257425742574256, "grad_norm": 0.032278869301080704, "learning_rate": 0.00010581448289104758, "loss": 0.0094, "step": 2410 }, { "epoch": 47.45544554455446, "grad_norm": 0.03324766084551811, "learning_rate": 0.00010518648862921012, "loss": 0.01, "step": 2420 }, { "epoch": 47.65346534653465, "grad_norm": 0.03345724940299988, "learning_rate": 0.00010455828920356115, "loss": 0.0103, "step": 2430 }, { "epoch": 47.851485148514854, "grad_norm": 0.034674759954214096, "learning_rate": 0.00010392990946401313, "loss": 0.0105, "step": 2440 }, { "epoch": 48.039603960396036, "grad_norm": 0.024850964546203613, "learning_rate": 0.00010330137426761135, "loss": 0.0109, "step": 2450 }, { "epoch": 48.23762376237624, "grad_norm": 0.0305289626121521, "learning_rate": 0.00010267270847755048, "loss": 0.0096, "step": 2460 }, { "epoch": 48.43564356435643, "grad_norm": 0.026470104232430458, "learning_rate": 0.00010204393696219117, "loss": 0.0099, "step": 2470 }, { "epoch": 48.633663366336634, "grad_norm": 0.035170264542102814, "learning_rate": 0.00010141508459407623, "loss": 0.0105, "step": 2480 }, { "epoch": 48.83168316831683, "grad_norm": 0.03382629156112671, "learning_rate": 0.00010078617624894684, "loss": 0.0107, "step": 2490 }, { "epoch": 49.01980198019802, "grad_norm": 0.02712225168943405, "learning_rate": 0.00010015723680475846, "loss": 0.0104, "step": 2500 }, { "epoch": 49.01980198019802, "eval_loss": 2.0715720653533936, "eval_runtime": 61.2747, "eval_samples_per_second": 5.173, "eval_steps_per_second": 2.595, "step": 2500 }, { "epoch": 49.21782178217822, "grad_norm": 0.2509881854057312, "learning_rate": 9.95282911406968e-05, "loss": 0.0105, "step": 2510 }, { "epoch": 49.415841584158414, "grad_norm": 0.03043426387012005, "learning_rate": 9.889936413619356e-05, "loss": 0.0098, "step": 2520 }, { "epoch": 49.613861386138616, "grad_norm": 0.03412380814552307, "learning_rate": 9.827048066994225e-05, "loss": 0.0115, "step": 2530 }, { "epoch": 49.81188118811881, "grad_norm": 0.20148912072181702, "learning_rate": 9.764166561891432e-05, "loss": 0.0116, "step": 2540 }, { "epoch": 50.0, "grad_norm": 0.26765531301498413, "learning_rate": 9.70129438573747e-05, "loss": 0.0118, "step": 2550 }, { "epoch": 50.198019801980195, "grad_norm": 0.0966796949505806, "learning_rate": 9.63843402558981e-05, "loss": 0.0111, "step": 2560 }, { "epoch": 50.396039603960396, "grad_norm": 0.11040028184652328, "learning_rate": 9.57558796803852e-05, "loss": 0.01, "step": 2570 }, { "epoch": 50.59405940594059, "grad_norm": 0.1609763503074646, "learning_rate": 9.512758699107879e-05, "loss": 0.011, "step": 2580 }, { "epoch": 50.79207920792079, "grad_norm": 0.07201898843050003, "learning_rate": 9.449948704158071e-05, "loss": 0.0106, "step": 2590 }, { "epoch": 50.99009900990099, "grad_norm": 0.06254687905311584, "learning_rate": 9.38716046778684e-05, "loss": 0.011, "step": 2600 }, { "epoch": 51.17821782178218, "grad_norm": 0.03031199239194393, "learning_rate": 9.324396473731217e-05, "loss": 0.0103, "step": 2610 }, { "epoch": 51.37623762376238, "grad_norm": 0.05816170200705528, "learning_rate": 9.261659204769284e-05, "loss": 0.0098, "step": 2620 }, { "epoch": 51.57425742574257, "grad_norm": 0.14442995190620422, "learning_rate": 9.198951142621929e-05, "loss": 0.0107, "step": 2630 }, { "epoch": 51.772277227722775, "grad_norm": 0.10595759749412537, "learning_rate": 9.136274767854716e-05, "loss": 0.0105, "step": 2640 }, { "epoch": 51.97029702970297, "grad_norm": 0.029600875452160835, "learning_rate": 9.07363255977973e-05, "loss": 0.0108, "step": 2650 }, { "epoch": 52.15841584158416, "grad_norm": 0.08049221336841583, "learning_rate": 9.011026996357503e-05, "loss": 0.0097, "step": 2660 }, { "epoch": 52.35643564356435, "grad_norm": 0.026186183094978333, "learning_rate": 8.948460554099018e-05, "loss": 0.0103, "step": 2670 }, { "epoch": 52.554455445544555, "grad_norm": 0.17796003818511963, "learning_rate": 8.885935707967716e-05, "loss": 0.0108, "step": 2680 }, { "epoch": 52.75247524752475, "grad_norm": 0.09132158011198044, "learning_rate": 8.823454931281616e-05, "loss": 0.0105, "step": 2690 }, { "epoch": 52.95049504950495, "grad_norm": 0.028834670782089233, "learning_rate": 8.76102069561545e-05, "loss": 0.0113, "step": 2700 }, { "epoch": 53.13861386138614, "grad_norm": 0.21559055149555206, "learning_rate": 8.698635470702923e-05, "loss": 0.0145, "step": 2710 }, { "epoch": 53.336633663366335, "grad_norm": 0.1729169338941574, "learning_rate": 8.636301724339004e-05, "loss": 0.0123, "step": 2720 }, { "epoch": 53.53465346534654, "grad_norm": 0.05573170632123947, "learning_rate": 8.574021922282292e-05, "loss": 0.0109, "step": 2730 }, { "epoch": 53.73267326732673, "grad_norm": 0.033353183418512344, "learning_rate": 8.511798528157512e-05, "loss": 0.011, "step": 2740 }, { "epoch": 53.93069306930693, "grad_norm": 0.03874226659536362, "learning_rate": 8.449634003358022e-05, "loss": 0.0116, "step": 2750 }, { "epoch": 54.118811881188115, "grad_norm": 0.029603777453303337, "learning_rate": 8.387530806948476e-05, "loss": 0.0101, "step": 2760 }, { "epoch": 54.31683168316832, "grad_norm": 0.11726613342761993, "learning_rate": 8.325491395567541e-05, "loss": 0.0109, "step": 2770 }, { "epoch": 54.51485148514851, "grad_norm": 0.04111700505018234, "learning_rate": 8.263518223330697e-05, "loss": 0.0106, "step": 2780 }, { "epoch": 54.71287128712871, "grad_norm": 0.06498590111732483, "learning_rate": 8.201613741733203e-05, "loss": 0.0115, "step": 2790 }, { "epoch": 54.91089108910891, "grad_norm": 0.03625292330980301, "learning_rate": 8.13978039955308e-05, "loss": 0.0145, "step": 2800 }, { "epoch": 55.0990099009901, "grad_norm": 0.10530784726142883, "learning_rate": 8.078020642754274e-05, "loss": 0.0115, "step": 2810 }, { "epoch": 55.2970297029703, "grad_norm": 0.059211425483226776, "learning_rate": 8.016336914389874e-05, "loss": 0.0098, "step": 2820 }, { "epoch": 55.495049504950494, "grad_norm": 0.029871392995119095, "learning_rate": 7.954731654505491e-05, "loss": 0.0105, "step": 2830 }, { "epoch": 55.693069306930695, "grad_norm": 0.034974247217178345, "learning_rate": 7.89320730004274e-05, "loss": 0.0104, "step": 2840 }, { "epoch": 55.89108910891089, "grad_norm": 0.04234467074275017, "learning_rate": 7.831766284742807e-05, "loss": 0.012, "step": 2850 }, { "epoch": 56.07920792079208, "grad_norm": 0.02388872392475605, "learning_rate": 7.77041103905023e-05, "loss": 0.0104, "step": 2860 }, { "epoch": 56.277227722772274, "grad_norm": 0.03674217313528061, "learning_rate": 7.709143990016702e-05, "loss": 0.0099, "step": 2870 }, { "epoch": 56.475247524752476, "grad_norm": 0.028430873528122902, "learning_rate": 7.6479675612051e-05, "loss": 0.0102, "step": 2880 }, { "epoch": 56.67326732673267, "grad_norm": 0.034957416355609894, "learning_rate": 7.586884172593609e-05, "loss": 0.0101, "step": 2890 }, { "epoch": 56.87128712871287, "grad_norm": 0.03257445991039276, "learning_rate": 7.525896240479976e-05, "loss": 0.0102, "step": 2900 }, { "epoch": 57.05940594059406, "grad_norm": 0.025701675564050674, "learning_rate": 7.465006177385953e-05, "loss": 0.0101, "step": 2910 }, { "epoch": 57.257425742574256, "grad_norm": 0.02833356149494648, "learning_rate": 7.404216391961847e-05, "loss": 0.0095, "step": 2920 }, { "epoch": 57.45544554455446, "grad_norm": 0.02513338252902031, "learning_rate": 7.343529288891239e-05, "loss": 0.0096, "step": 2930 }, { "epoch": 57.65346534653465, "grad_norm": 0.02715817466378212, "learning_rate": 7.282947268795877e-05, "loss": 0.01, "step": 2940 }, { "epoch": 57.851485148514854, "grad_norm": 0.030358904972672462, "learning_rate": 7.222472728140695e-05, "loss": 0.0101, "step": 2950 }, { "epoch": 58.039603960396036, "grad_norm": 0.02921466901898384, "learning_rate": 7.162108059139032e-05, "loss": 0.01, "step": 2960 }, { "epoch": 58.23762376237624, "grad_norm": 0.025500759482383728, "learning_rate": 7.101855649657991e-05, "loss": 0.0093, "step": 2970 }, { "epoch": 58.43564356435643, "grad_norm": 0.03553071618080139, "learning_rate": 7.041717883123977e-05, "loss": 0.0095, "step": 2980 }, { "epoch": 58.633663366336634, "grad_norm": 0.03326829522848129, "learning_rate": 6.981697138428434e-05, "loss": 0.0097, "step": 2990 }, { "epoch": 58.83168316831683, "grad_norm": 0.03296545520424843, "learning_rate": 6.921795789833723e-05, "loss": 0.0102, "step": 3000 }, { "epoch": 58.83168316831683, "eval_loss": 2.1854326725006104, "eval_runtime": 63.8492, "eval_samples_per_second": 4.965, "eval_steps_per_second": 2.49, "step": 3000 }, { "epoch": 59.01980198019802, "grad_norm": 0.03292296826839447, "learning_rate": 6.862016206879216e-05, "loss": 0.0101, "step": 3010 }, { "epoch": 59.21782178217822, "grad_norm": 0.03038034774363041, "learning_rate": 6.802360754287547e-05, "loss": 0.0092, "step": 3020 }, { "epoch": 59.415841584158414, "grad_norm": 0.031039321795105934, "learning_rate": 6.742831791871096e-05, "loss": 0.0095, "step": 3030 }, { "epoch": 59.613861386138616, "grad_norm": 0.034637823700904846, "learning_rate": 6.683431674438612e-05, "loss": 0.0096, "step": 3040 }, { "epoch": 59.81188118811881, "grad_norm": 0.034620340913534164, "learning_rate": 6.624162751702076e-05, "loss": 0.01, "step": 3050 }, { "epoch": 60.0, "grad_norm": 0.04842129349708557, "learning_rate": 6.565027368183769e-05, "loss": 0.0102, "step": 3060 }, { "epoch": 60.198019801980195, "grad_norm": 0.027831487357616425, "learning_rate": 6.506027863123492e-05, "loss": 0.0092, "step": 3070 }, { "epoch": 60.396039603960396, "grad_norm": 0.028759174048900604, "learning_rate": 6.447166570386063e-05, "loss": 0.0095, "step": 3080 }, { "epoch": 60.59405940594059, "grad_norm": 0.030392073094844818, "learning_rate": 6.388445818368991e-05, "loss": 0.0096, "step": 3090 }, { "epoch": 60.79207920792079, "grad_norm": 0.0333111509680748, "learning_rate": 6.329867929910347e-05, "loss": 0.01, "step": 3100 }, { "epoch": 60.99009900990099, "grad_norm": 0.030943069607019424, "learning_rate": 6.271435222196916e-05, "loss": 0.0099, "step": 3110 }, { "epoch": 61.17821782178218, "grad_norm": 0.027012720704078674, "learning_rate": 6.213150006672499e-05, "loss": 0.0093, "step": 3120 }, { "epoch": 61.37623762376238, "grad_norm": 0.02961895428597927, "learning_rate": 6.15501458894651e-05, "loss": 0.0094, "step": 3130 }, { "epoch": 61.57425742574257, "grad_norm": 0.02908298186957836, "learning_rate": 6.097031268702746e-05, "loss": 0.0095, "step": 3140 }, { "epoch": 61.772277227722775, "grad_norm": 0.03335884213447571, "learning_rate": 6.039202339608432e-05, "loss": 0.0098, "step": 3150 }, { "epoch": 61.97029702970297, "grad_norm": 0.03192156180739403, "learning_rate": 5.981530089223489e-05, "loss": 0.01, "step": 3160 }, { "epoch": 62.15841584158416, "grad_norm": 0.02803504653275013, "learning_rate": 5.924016798910037e-05, "loss": 0.0094, "step": 3170 }, { "epoch": 62.35643564356435, "grad_norm": 0.03200392797589302, "learning_rate": 5.866664743742162e-05, "loss": 0.0093, "step": 3180 }, { "epoch": 62.554455445544555, "grad_norm": 0.03177599608898163, "learning_rate": 5.809476192415905e-05, "loss": 0.0097, "step": 3190 }, { "epoch": 62.75247524752475, "grad_norm": 0.02948221005499363, "learning_rate": 5.752453407159522e-05, "loss": 0.0098, "step": 3200 }, { "epoch": 62.95049504950495, "grad_norm": 0.02995140664279461, "learning_rate": 5.69559864364402e-05, "loss": 0.0099, "step": 3210 }, { "epoch": 63.13861386138614, "grad_norm": 0.029279688373208046, "learning_rate": 5.6389141508938903e-05, "loss": 0.0092, "step": 3220 }, { "epoch": 63.336633663366335, "grad_norm": 0.025498950853943825, "learning_rate": 5.5824021711981686e-05, "loss": 0.0093, "step": 3230 }, { "epoch": 63.53465346534654, "grad_norm": 0.03137169033288956, "learning_rate": 5.5260649400217326e-05, "loss": 0.0095, "step": 3240 }, { "epoch": 63.73267326732673, "grad_norm": 0.028380287811160088, "learning_rate": 5.469904685916861e-05, "loss": 0.0098, "step": 3250 }, { "epoch": 63.93069306930693, "grad_norm": 0.031060099601745605, "learning_rate": 5.4139236304350935e-05, "loss": 0.0099, "step": 3260 }, { "epoch": 64.11881188118812, "grad_norm": 0.025437017902731895, "learning_rate": 5.3581239880393375e-05, "loss": 0.0094, "step": 3270 }, { "epoch": 64.31683168316832, "grad_norm": 0.025531411170959473, "learning_rate": 5.302507966016295e-05, "loss": 0.0091, "step": 3280 }, { "epoch": 64.51485148514851, "grad_norm": 0.03445703908801079, "learning_rate": 5.247077764389099e-05, "loss": 0.0095, "step": 3290 }, { "epoch": 64.7128712871287, "grad_norm": 0.03112807497382164, "learning_rate": 5.191835575830352e-05, "loss": 0.0098, "step": 3300 }, { "epoch": 64.91089108910892, "grad_norm": 0.032531820237636566, "learning_rate": 5.136783585575336e-05, "loss": 0.0099, "step": 3310 }, { "epoch": 65.0990099009901, "grad_norm": 0.028146883472800255, "learning_rate": 5.081923971335582e-05, "loss": 0.0095, "step": 3320 }, { "epoch": 65.29702970297029, "grad_norm": 0.03265933692455292, "learning_rate": 5.0272589032127594e-05, "loss": 0.0091, "step": 3330 }, { "epoch": 65.4950495049505, "grad_norm": 0.026051949709653854, "learning_rate": 4.972790543612783e-05, "loss": 0.0094, "step": 3340 }, { "epoch": 65.6930693069307, "grad_norm": 0.031889524310827255, "learning_rate": 4.918521047160308e-05, "loss": 0.0096, "step": 3350 }, { "epoch": 65.89108910891089, "grad_norm": 0.032141778618097305, "learning_rate": 4.864452560613485e-05, "loss": 0.0098, "step": 3360 }, { "epoch": 66.07920792079207, "grad_norm": 0.026572005823254585, "learning_rate": 4.810587222779043e-05, "loss": 0.0096, "step": 3370 }, { "epoch": 66.27722772277228, "grad_norm": 0.026687586680054665, "learning_rate": 4.756927164427685e-05, "loss": 0.0091, "step": 3380 }, { "epoch": 66.47524752475248, "grad_norm": 0.029472634196281433, "learning_rate": 4.703474508209793e-05, "loss": 0.0093, "step": 3390 }, { "epoch": 66.67326732673267, "grad_norm": 0.03171619027853012, "learning_rate": 4.650231368571486e-05, "loss": 0.0096, "step": 3400 }, { "epoch": 66.87128712871286, "grad_norm": 0.02731896936893463, "learning_rate": 4.597199851670932e-05, "loss": 0.0098, "step": 3410 }, { "epoch": 67.05940594059406, "grad_norm": 0.028085224330425262, "learning_rate": 4.54438205529508e-05, "loss": 0.0095, "step": 3420 }, { "epoch": 67.25742574257426, "grad_norm": 0.033895280212163925, "learning_rate": 4.491780068776663e-05, "loss": 0.0092, "step": 3430 }, { "epoch": 67.45544554455445, "grad_norm": 0.030533580109477043, "learning_rate": 4.4393959729115244e-05, "loss": 0.0094, "step": 3440 }, { "epoch": 67.65346534653466, "grad_norm": 0.032808952033519745, "learning_rate": 4.387231839876349e-05, "loss": 0.0095, "step": 3450 }, { "epoch": 67.85148514851485, "grad_norm": 0.02919154055416584, "learning_rate": 4.335289733146665e-05, "loss": 0.0097, "step": 3460 }, { "epoch": 68.03960396039604, "grad_norm": 0.02805473655462265, "learning_rate": 4.283571707415214e-05, "loss": 0.0098, "step": 3470 }, { "epoch": 68.23762376237623, "grad_norm": 0.02642083168029785, "learning_rate": 4.2320798085107036e-05, "loss": 0.0091, "step": 3480 }, { "epoch": 68.43564356435644, "grad_norm": 0.02746916376054287, "learning_rate": 4.18081607331685e-05, "loss": 0.0092, "step": 3490 }, { "epoch": 68.63366336633663, "grad_norm": 0.030231185257434845, "learning_rate": 4.129782529691815e-05, "loss": 0.0095, "step": 3500 }, { "epoch": 68.63366336633663, "eval_loss": 2.408777952194214, "eval_runtime": 62.4857, "eval_samples_per_second": 5.073, "eval_steps_per_second": 2.545, "step": 3500 } ], "logging_steps": 10, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0035978035769508e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }