{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 9690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002063983488132095, "grad_norm": 21.791844475218177, "learning_rate": 1.718213058419244e-08, "loss": 1.2399, "step": 1 }, { "epoch": 0.010319917440660475, "grad_norm": 2.758771436717397, "learning_rate": 8.59106529209622e-07, "loss": 1.0835, "step": 50 }, { "epoch": 0.02063983488132095, "grad_norm": 1.6583275647715192, "learning_rate": 1.718213058419244e-06, "loss": 0.8281, "step": 100 }, { "epoch": 0.030959752321981424, "grad_norm": 1.052445004470504, "learning_rate": 2.577319587628866e-06, "loss": 0.7456, "step": 150 }, { "epoch": 0.0412796697626419, "grad_norm": 0.6644727438734609, "learning_rate": 3.436426116838488e-06, "loss": 0.6981, "step": 200 }, { "epoch": 0.05159958720330237, "grad_norm": 0.6511384641685941, "learning_rate": 4.2955326460481105e-06, "loss": 0.6756, "step": 250 }, { "epoch": 0.06191950464396285, "grad_norm": 0.6728510847115857, "learning_rate": 4.995212256623046e-06, "loss": 0.669, "step": 300 }, { "epoch": 0.07223942208462332, "grad_norm": 0.6002063765996374, "learning_rate": 4.968613682306629e-06, "loss": 0.6488, "step": 350 }, { "epoch": 0.0825593395252838, "grad_norm": 0.5807563716846417, "learning_rate": 4.942015107990212e-06, "loss": 0.641, "step": 400 }, { "epoch": 0.09287925696594428, "grad_norm": 0.5651403042158113, "learning_rate": 4.915416533673795e-06, "loss": 0.6347, "step": 450 }, { "epoch": 0.10319917440660474, "grad_norm": 0.5556237524713133, "learning_rate": 4.888817959357379e-06, "loss": 0.6292, "step": 500 }, { "epoch": 0.11351909184726522, "grad_norm": 0.5537920539892396, "learning_rate": 4.862219385040962e-06, "loss": 0.6207, "step": 550 }, { "epoch": 0.1238390092879257, "grad_norm": 0.6516923682179434, "learning_rate": 4.835620810724546e-06, "loss": 0.6221, "step": 600 }, { "epoch": 0.13415892672858618, "grad_norm": 0.5993983563444428, "learning_rate": 4.809022236408129e-06, "loss": 0.6134, "step": 650 }, { "epoch": 0.14447884416924664, "grad_norm": 0.5382978582953953, "learning_rate": 4.782423662091712e-06, "loss": 0.6142, "step": 700 }, { "epoch": 0.15479876160990713, "grad_norm": 0.5360001817415995, "learning_rate": 4.7558250877752955e-06, "loss": 0.6064, "step": 750 }, { "epoch": 0.1651186790505676, "grad_norm": 0.5196308091067839, "learning_rate": 4.729226513458879e-06, "loss": 0.5988, "step": 800 }, { "epoch": 0.17543859649122806, "grad_norm": 0.5427506989599503, "learning_rate": 4.702627939142462e-06, "loss": 0.6025, "step": 850 }, { "epoch": 0.18575851393188855, "grad_norm": 0.5521424095703439, "learning_rate": 4.676029364826046e-06, "loss": 0.597, "step": 900 }, { "epoch": 0.19607843137254902, "grad_norm": 0.6345336850282564, "learning_rate": 4.649430790509629e-06, "loss": 0.5887, "step": 950 }, { "epoch": 0.20639834881320948, "grad_norm": 0.5971258034612797, "learning_rate": 4.6228322161932124e-06, "loss": 0.5815, "step": 1000 }, { "epoch": 0.21671826625386997, "grad_norm": 0.5438942448938008, "learning_rate": 4.596233641876796e-06, "loss": 0.5805, "step": 1050 }, { "epoch": 0.22703818369453044, "grad_norm": 0.5334432938197203, "learning_rate": 4.569635067560379e-06, "loss": 0.5738, "step": 1100 }, { "epoch": 0.23735810113519093, "grad_norm": 0.5347450534004635, "learning_rate": 4.543036493243962e-06, "loss": 0.5801, "step": 1150 }, { "epoch": 0.2476780185758514, "grad_norm": 0.5873203374469886, "learning_rate": 4.516437918927546e-06, "loss": 0.5834, "step": 1200 }, { "epoch": 0.2579979360165119, "grad_norm": 0.5240586732453324, "learning_rate": 4.489839344611129e-06, "loss": 0.5721, "step": 1250 }, { "epoch": 0.26831785345717235, "grad_norm": 0.5575825059292604, "learning_rate": 4.463240770294713e-06, "loss": 0.5779, "step": 1300 }, { "epoch": 0.2786377708978328, "grad_norm": 0.5561922023755285, "learning_rate": 4.436642195978296e-06, "loss": 0.5739, "step": 1350 }, { "epoch": 0.2889576883384933, "grad_norm": 0.5214056422410267, "learning_rate": 4.410043621661879e-06, "loss": 0.5593, "step": 1400 }, { "epoch": 0.29927760577915374, "grad_norm": 0.5509795794922944, "learning_rate": 4.383445047345462e-06, "loss": 0.563, "step": 1450 }, { "epoch": 0.30959752321981426, "grad_norm": 0.5698665633020146, "learning_rate": 4.356846473029046e-06, "loss": 0.5627, "step": 1500 }, { "epoch": 0.31991744066047473, "grad_norm": 0.5245577515841476, "learning_rate": 4.3302478987126295e-06, "loss": 0.5678, "step": 1550 }, { "epoch": 0.3302373581011352, "grad_norm": 0.6001876556912392, "learning_rate": 4.303649324396213e-06, "loss": 0.5709, "step": 1600 }, { "epoch": 0.34055727554179566, "grad_norm": 0.5333018384868324, "learning_rate": 4.277050750079796e-06, "loss": 0.5581, "step": 1650 }, { "epoch": 0.3508771929824561, "grad_norm": 0.5133912524315866, "learning_rate": 4.250452175763379e-06, "loss": 0.5618, "step": 1700 }, { "epoch": 0.36119711042311664, "grad_norm": 0.5077938795371925, "learning_rate": 4.223853601446962e-06, "loss": 0.5573, "step": 1750 }, { "epoch": 0.3715170278637771, "grad_norm": 0.5463753688650598, "learning_rate": 4.1972550271305465e-06, "loss": 0.5539, "step": 1800 }, { "epoch": 0.38183694530443757, "grad_norm": 0.5787935899358873, "learning_rate": 4.17065645281413e-06, "loss": 0.5483, "step": 1850 }, { "epoch": 0.39215686274509803, "grad_norm": 0.5654912899054569, "learning_rate": 4.144057878497713e-06, "loss": 0.558, "step": 1900 }, { "epoch": 0.4024767801857585, "grad_norm": 0.4996009494084185, "learning_rate": 4.117459304181296e-06, "loss": 0.5456, "step": 1950 }, { "epoch": 0.41279669762641896, "grad_norm": 0.5667395400639976, "learning_rate": 4.090860729864879e-06, "loss": 0.5486, "step": 2000 }, { "epoch": 0.4231166150670795, "grad_norm": 0.5188779450810256, "learning_rate": 4.0642621555484625e-06, "loss": 0.5464, "step": 2050 }, { "epoch": 0.43343653250773995, "grad_norm": 0.5362183280913333, "learning_rate": 4.037663581232047e-06, "loss": 0.5521, "step": 2100 }, { "epoch": 0.4437564499484004, "grad_norm": 0.5473955026723762, "learning_rate": 4.01106500691563e-06, "loss": 0.5445, "step": 2150 }, { "epoch": 0.4540763673890609, "grad_norm": 0.5397425583264429, "learning_rate": 3.984466432599213e-06, "loss": 0.5585, "step": 2200 }, { "epoch": 0.46439628482972134, "grad_norm": 0.5821347601023747, "learning_rate": 3.957867858282796e-06, "loss": 0.5513, "step": 2250 }, { "epoch": 0.47471620227038186, "grad_norm": 0.5167859554911757, "learning_rate": 3.9312692839663795e-06, "loss": 0.5401, "step": 2300 }, { "epoch": 0.4850361197110423, "grad_norm": 0.5353436940913985, "learning_rate": 3.904670709649963e-06, "loss": 0.5413, "step": 2350 }, { "epoch": 0.4953560371517028, "grad_norm": 0.5450597051287799, "learning_rate": 3.878072135333547e-06, "loss": 0.5377, "step": 2400 }, { "epoch": 0.5056759545923633, "grad_norm": 0.5133718526538825, "learning_rate": 3.85147356101713e-06, "loss": 0.5411, "step": 2450 }, { "epoch": 0.5159958720330238, "grad_norm": 0.5194969527334257, "learning_rate": 3.824874986700713e-06, "loss": 0.5504, "step": 2500 }, { "epoch": 0.5263157894736842, "grad_norm": 0.5244000570689482, "learning_rate": 3.7982764123842964e-06, "loss": 0.5373, "step": 2550 }, { "epoch": 0.5366357069143447, "grad_norm": 0.4954661200186007, "learning_rate": 3.7716778380678796e-06, "loss": 0.5321, "step": 2600 }, { "epoch": 0.5469556243550051, "grad_norm": 0.507806040946836, "learning_rate": 3.7450792637514633e-06, "loss": 0.5363, "step": 2650 }, { "epoch": 0.5572755417956656, "grad_norm": 0.5217141184924292, "learning_rate": 3.7184806894350465e-06, "loss": 0.5374, "step": 2700 }, { "epoch": 0.5675954592363261, "grad_norm": 1.0654133163473936, "learning_rate": 3.69188211511863e-06, "loss": 0.5387, "step": 2750 }, { "epoch": 0.5779153766769866, "grad_norm": 0.5300385445449849, "learning_rate": 3.665283540802213e-06, "loss": 0.5328, "step": 2800 }, { "epoch": 0.5882352941176471, "grad_norm": 0.5195789655014079, "learning_rate": 3.6386849664857966e-06, "loss": 0.5448, "step": 2850 }, { "epoch": 0.5985552115583075, "grad_norm": 0.5400701284508418, "learning_rate": 3.6120863921693798e-06, "loss": 0.5225, "step": 2900 }, { "epoch": 0.608875128998968, "grad_norm": 0.5192375803111897, "learning_rate": 3.5854878178529634e-06, "loss": 0.5354, "step": 2950 }, { "epoch": 0.6191950464396285, "grad_norm": 0.5157981546008934, "learning_rate": 3.558889243536547e-06, "loss": 0.5234, "step": 3000 }, { "epoch": 0.6295149638802889, "grad_norm": 0.5189888311937375, "learning_rate": 3.5322906692201303e-06, "loss": 0.5261, "step": 3050 }, { "epoch": 0.6398348813209495, "grad_norm": 0.520981935459507, "learning_rate": 3.505692094903713e-06, "loss": 0.5277, "step": 3100 }, { "epoch": 0.6501547987616099, "grad_norm": 0.5385057842452714, "learning_rate": 3.4790935205872967e-06, "loss": 0.5337, "step": 3150 }, { "epoch": 0.6604747162022704, "grad_norm": 0.5090039339282304, "learning_rate": 3.45249494627088e-06, "loss": 0.5246, "step": 3200 }, { "epoch": 0.6707946336429309, "grad_norm": 0.5309810767744108, "learning_rate": 3.4258963719544636e-06, "loss": 0.532, "step": 3250 }, { "epoch": 0.6811145510835913, "grad_norm": 0.5117993267505484, "learning_rate": 3.3992977976380472e-06, "loss": 0.5329, "step": 3300 }, { "epoch": 0.6914344685242518, "grad_norm": 0.542882178286457, "learning_rate": 3.3726992233216304e-06, "loss": 0.5262, "step": 3350 }, { "epoch": 0.7017543859649122, "grad_norm": 0.5485246009385575, "learning_rate": 3.3461006490052132e-06, "loss": 0.5288, "step": 3400 }, { "epoch": 0.7120743034055728, "grad_norm": 0.5245153157581836, "learning_rate": 3.319502074688797e-06, "loss": 0.5287, "step": 3450 }, { "epoch": 0.7223942208462333, "grad_norm": 0.5408996443293894, "learning_rate": 3.29290350037238e-06, "loss": 0.5325, "step": 3500 }, { "epoch": 0.7327141382868937, "grad_norm": 0.539947013464035, "learning_rate": 3.2663049260559637e-06, "loss": 0.523, "step": 3550 }, { "epoch": 0.7430340557275542, "grad_norm": 0.5322841773244653, "learning_rate": 3.2397063517395474e-06, "loss": 0.5226, "step": 3600 }, { "epoch": 0.7533539731682146, "grad_norm": 0.5150328597209529, "learning_rate": 3.2131077774231306e-06, "loss": 0.5182, "step": 3650 }, { "epoch": 0.7636738906088751, "grad_norm": 0.5187564835457917, "learning_rate": 3.1865092031067134e-06, "loss": 0.5226, "step": 3700 }, { "epoch": 0.7739938080495357, "grad_norm": 0.5376206106185862, "learning_rate": 3.159910628790297e-06, "loss": 0.5221, "step": 3750 }, { "epoch": 0.7843137254901961, "grad_norm": 0.4886509664687733, "learning_rate": 3.1333120544738802e-06, "loss": 0.5167, "step": 3800 }, { "epoch": 0.7946336429308566, "grad_norm": 0.5186897778877932, "learning_rate": 3.106713480157464e-06, "loss": 0.5215, "step": 3850 }, { "epoch": 0.804953560371517, "grad_norm": 0.5662670913922451, "learning_rate": 3.0801149058410475e-06, "loss": 0.5156, "step": 3900 }, { "epoch": 0.8152734778121775, "grad_norm": 0.5068699760497294, "learning_rate": 3.0535163315246307e-06, "loss": 0.5117, "step": 3950 }, { "epoch": 0.8255933952528379, "grad_norm": 0.525153451451465, "learning_rate": 3.0269177572082135e-06, "loss": 0.5284, "step": 4000 }, { "epoch": 0.8359133126934984, "grad_norm": 0.5415917077347273, "learning_rate": 3.000319182891797e-06, "loss": 0.5233, "step": 4050 }, { "epoch": 0.846233230134159, "grad_norm": 0.5122355304442054, "learning_rate": 2.9737206085753804e-06, "loss": 0.52, "step": 4100 }, { "epoch": 0.8565531475748194, "grad_norm": 0.5084493187015482, "learning_rate": 2.947122034258964e-06, "loss": 0.5122, "step": 4150 }, { "epoch": 0.8668730650154799, "grad_norm": 0.5222318358455478, "learning_rate": 2.9205234599425477e-06, "loss": 0.5164, "step": 4200 }, { "epoch": 0.8771929824561403, "grad_norm": 0.4952100113115859, "learning_rate": 2.893924885626131e-06, "loss": 0.5184, "step": 4250 }, { "epoch": 0.8875128998968008, "grad_norm": 0.5129926015439157, "learning_rate": 2.8673263113097137e-06, "loss": 0.5217, "step": 4300 }, { "epoch": 0.8978328173374613, "grad_norm": 0.5408976129024315, "learning_rate": 2.8407277369932973e-06, "loss": 0.5298, "step": 4350 }, { "epoch": 0.9081527347781218, "grad_norm": 0.523128008449258, "learning_rate": 2.8141291626768805e-06, "loss": 0.5236, "step": 4400 }, { "epoch": 0.9184726522187823, "grad_norm": 0.545708918135539, "learning_rate": 2.787530588360464e-06, "loss": 0.516, "step": 4450 }, { "epoch": 0.9287925696594427, "grad_norm": 0.5643988024472154, "learning_rate": 2.760932014044048e-06, "loss": 0.5157, "step": 4500 }, { "epoch": 0.9391124871001032, "grad_norm": 0.49490206521060065, "learning_rate": 2.7343334397276306e-06, "loss": 0.5143, "step": 4550 }, { "epoch": 0.9494324045407637, "grad_norm": 0.530477676869888, "learning_rate": 2.707734865411214e-06, "loss": 0.5097, "step": 4600 }, { "epoch": 0.9597523219814241, "grad_norm": 0.5419644910097084, "learning_rate": 2.6811362910947975e-06, "loss": 0.5047, "step": 4650 }, { "epoch": 0.9700722394220846, "grad_norm": 0.563023481870365, "learning_rate": 2.654537716778381e-06, "loss": 0.51, "step": 4700 }, { "epoch": 0.9803921568627451, "grad_norm": 0.5885551415729929, "learning_rate": 2.6279391424619643e-06, "loss": 0.5141, "step": 4750 }, { "epoch": 0.9907120743034056, "grad_norm": 0.555238399523142, "learning_rate": 2.601340568145548e-06, "loss": 0.5144, "step": 4800 }, { "epoch": 1.0, "eval_loss": 0.5763986110687256, "eval_runtime": 22.6777, "eval_samples_per_second": 23.724, "eval_steps_per_second": 0.397, "step": 4845 }, { "epoch": 1.001031991744066, "grad_norm": 0.5251455186752938, "learning_rate": 2.5747419938291308e-06, "loss": 0.5043, "step": 4850 }, { "epoch": 1.0113519091847265, "grad_norm": 0.5102862437645022, "learning_rate": 2.548143419512714e-06, "loss": 0.4557, "step": 4900 }, { "epoch": 1.021671826625387, "grad_norm": 0.5329812930765626, "learning_rate": 2.5215448451962976e-06, "loss": 0.4593, "step": 4950 }, { "epoch": 1.0319917440660475, "grad_norm": 0.5310044546142733, "learning_rate": 2.4949462708798813e-06, "loss": 0.4601, "step": 5000 }, { "epoch": 1.0423116615067078, "grad_norm": 0.5291259662919947, "learning_rate": 2.4683476965634645e-06, "loss": 0.4573, "step": 5050 }, { "epoch": 1.0526315789473684, "grad_norm": 0.5758879453674484, "learning_rate": 2.4417491222470477e-06, "loss": 0.4545, "step": 5100 }, { "epoch": 1.0629514963880289, "grad_norm": 0.5701692974214282, "learning_rate": 2.4151505479306313e-06, "loss": 0.4542, "step": 5150 }, { "epoch": 1.0732714138286894, "grad_norm": 0.584779025941461, "learning_rate": 2.3885519736142146e-06, "loss": 0.4494, "step": 5200 }, { "epoch": 1.08359133126935, "grad_norm": 0.5634926571145632, "learning_rate": 2.3619533992977978e-06, "loss": 0.4453, "step": 5250 }, { "epoch": 1.0939112487100102, "grad_norm": 0.5287888561997275, "learning_rate": 2.3353548249813814e-06, "loss": 0.4494, "step": 5300 }, { "epoch": 1.1042311661506707, "grad_norm": 0.566118676754179, "learning_rate": 2.3087562506649646e-06, "loss": 0.4472, "step": 5350 }, { "epoch": 1.1145510835913313, "grad_norm": 0.5591675478155297, "learning_rate": 2.282157676348548e-06, "loss": 0.4487, "step": 5400 }, { "epoch": 1.1248710010319918, "grad_norm": 0.5234427712256148, "learning_rate": 2.2555591020321315e-06, "loss": 0.4564, "step": 5450 }, { "epoch": 1.1351909184726523, "grad_norm": 0.5341771123435606, "learning_rate": 2.2289605277157147e-06, "loss": 0.4546, "step": 5500 }, { "epoch": 1.1455108359133126, "grad_norm": 0.5963942909826432, "learning_rate": 2.202361953399298e-06, "loss": 0.4523, "step": 5550 }, { "epoch": 1.1558307533539731, "grad_norm": 0.5668397788463199, "learning_rate": 2.1757633790828816e-06, "loss": 0.456, "step": 5600 }, { "epoch": 1.1661506707946336, "grad_norm": 0.5324369168992059, "learning_rate": 2.1491648047664648e-06, "loss": 0.458, "step": 5650 }, { "epoch": 1.1764705882352942, "grad_norm": 0.5936240911653264, "learning_rate": 2.122566230450048e-06, "loss": 0.4406, "step": 5700 }, { "epoch": 1.1867905056759547, "grad_norm": 0.5813588072096951, "learning_rate": 2.0959676561336316e-06, "loss": 0.4542, "step": 5750 }, { "epoch": 1.197110423116615, "grad_norm": 0.585714648308596, "learning_rate": 2.0693690818172144e-06, "loss": 0.4435, "step": 5800 }, { "epoch": 1.2074303405572755, "grad_norm": 0.5428768784930765, "learning_rate": 2.042770507500798e-06, "loss": 0.4468, "step": 5850 }, { "epoch": 1.217750257997936, "grad_norm": 0.5368254232870538, "learning_rate": 2.0161719331843817e-06, "loss": 0.4538, "step": 5900 }, { "epoch": 1.2280701754385965, "grad_norm": 0.548798947651045, "learning_rate": 1.9895733588679645e-06, "loss": 0.447, "step": 5950 }, { "epoch": 1.238390092879257, "grad_norm": 0.612099006418541, "learning_rate": 1.962974784551548e-06, "loss": 0.4515, "step": 6000 }, { "epoch": 1.2487100103199174, "grad_norm": 0.55639703440266, "learning_rate": 1.936376210235132e-06, "loss": 0.446, "step": 6050 }, { "epoch": 1.2590299277605779, "grad_norm": 0.5807813468059144, "learning_rate": 1.9097776359187146e-06, "loss": 0.4518, "step": 6100 }, { "epoch": 1.2693498452012384, "grad_norm": 0.5654058519092208, "learning_rate": 1.8831790616022982e-06, "loss": 0.4458, "step": 6150 }, { "epoch": 1.279669762641899, "grad_norm": 0.5935622910169027, "learning_rate": 1.8565804872858817e-06, "loss": 0.4539, "step": 6200 }, { "epoch": 1.2899896800825594, "grad_norm": 0.5320246374099179, "learning_rate": 1.8299819129694649e-06, "loss": 0.4523, "step": 6250 }, { "epoch": 1.3003095975232197, "grad_norm": 0.5993250962476825, "learning_rate": 1.8033833386530483e-06, "loss": 0.4525, "step": 6300 }, { "epoch": 1.3106295149638802, "grad_norm": 0.5706555061755686, "learning_rate": 1.7767847643366317e-06, "loss": 0.448, "step": 6350 }, { "epoch": 1.3209494324045408, "grad_norm": 0.5604368666415582, "learning_rate": 1.750186190020215e-06, "loss": 0.4547, "step": 6400 }, { "epoch": 1.3312693498452013, "grad_norm": 0.5368477750147255, "learning_rate": 1.7235876157037984e-06, "loss": 0.4522, "step": 6450 }, { "epoch": 1.3415892672858618, "grad_norm": 0.608531892658329, "learning_rate": 1.6969890413873818e-06, "loss": 0.4529, "step": 6500 }, { "epoch": 1.351909184726522, "grad_norm": 0.5524704535036694, "learning_rate": 1.670390467070965e-06, "loss": 0.4449, "step": 6550 }, { "epoch": 1.3622291021671826, "grad_norm": 0.608361151275691, "learning_rate": 1.6437918927545484e-06, "loss": 0.449, "step": 6600 }, { "epoch": 1.3725490196078431, "grad_norm": 0.5350140506451774, "learning_rate": 1.6171933184381319e-06, "loss": 0.4448, "step": 6650 }, { "epoch": 1.3828689370485037, "grad_norm": 0.5657631888716299, "learning_rate": 1.590594744121715e-06, "loss": 0.4505, "step": 6700 }, { "epoch": 1.3931888544891642, "grad_norm": 0.5638748961635498, "learning_rate": 1.5639961698052985e-06, "loss": 0.4516, "step": 6750 }, { "epoch": 1.4035087719298245, "grad_norm": 0.5838904002390698, "learning_rate": 1.537397595488882e-06, "loss": 0.4434, "step": 6800 }, { "epoch": 1.413828689370485, "grad_norm": 0.582688659499904, "learning_rate": 1.5107990211724652e-06, "loss": 0.4451, "step": 6850 }, { "epoch": 1.4241486068111455, "grad_norm": 0.5666153072627447, "learning_rate": 1.4842004468560486e-06, "loss": 0.4513, "step": 6900 }, { "epoch": 1.434468524251806, "grad_norm": 0.5830919184881627, "learning_rate": 1.457601872539632e-06, "loss": 0.4499, "step": 6950 }, { "epoch": 1.4447884416924666, "grad_norm": 0.5392957954275294, "learning_rate": 1.4310032982232152e-06, "loss": 0.4526, "step": 7000 }, { "epoch": 1.4551083591331269, "grad_norm": 0.5796751590874428, "learning_rate": 1.4044047239067987e-06, "loss": 0.4464, "step": 7050 }, { "epoch": 1.4654282765737874, "grad_norm": 0.649115716424128, "learning_rate": 1.377806149590382e-06, "loss": 0.4448, "step": 7100 }, { "epoch": 1.475748194014448, "grad_norm": 0.5437354389754013, "learning_rate": 1.3512075752739653e-06, "loss": 0.4488, "step": 7150 }, { "epoch": 1.4860681114551084, "grad_norm": 0.575952575422212, "learning_rate": 1.3246090009575488e-06, "loss": 0.4445, "step": 7200 }, { "epoch": 1.496388028895769, "grad_norm": 0.5689643540076003, "learning_rate": 1.2980104266411322e-06, "loss": 0.4427, "step": 7250 }, { "epoch": 1.5067079463364292, "grad_norm": 0.5433467470767702, "learning_rate": 1.2714118523247154e-06, "loss": 0.4469, "step": 7300 }, { "epoch": 1.5170278637770898, "grad_norm": 0.5785369307750878, "learning_rate": 1.2448132780082988e-06, "loss": 0.4418, "step": 7350 }, { "epoch": 1.5273477812177503, "grad_norm": 0.5946065204485477, "learning_rate": 1.2182147036918823e-06, "loss": 0.45, "step": 7400 }, { "epoch": 1.5376676986584106, "grad_norm": 0.5276268731765017, "learning_rate": 1.1916161293754657e-06, "loss": 0.4425, "step": 7450 }, { "epoch": 1.5479876160990713, "grad_norm": 0.5456969115711289, "learning_rate": 1.165017555059049e-06, "loss": 0.4426, "step": 7500 }, { "epoch": 1.5583075335397316, "grad_norm": 0.5626443461851722, "learning_rate": 1.1384189807426323e-06, "loss": 0.4457, "step": 7550 }, { "epoch": 1.5686274509803921, "grad_norm": 0.5748093743725187, "learning_rate": 1.1118204064262155e-06, "loss": 0.4425, "step": 7600 }, { "epoch": 1.5789473684210527, "grad_norm": 0.6398790495774637, "learning_rate": 1.085221832109799e-06, "loss": 0.4461, "step": 7650 }, { "epoch": 1.589267285861713, "grad_norm": 0.5502896358409545, "learning_rate": 1.0586232577933824e-06, "loss": 0.4324, "step": 7700 }, { "epoch": 1.5995872033023737, "grad_norm": 0.547970248426798, "learning_rate": 1.0320246834769656e-06, "loss": 0.4367, "step": 7750 }, { "epoch": 1.609907120743034, "grad_norm": 0.5553997442333781, "learning_rate": 1.005426109160549e-06, "loss": 0.4518, "step": 7800 }, { "epoch": 1.6202270381836945, "grad_norm": 0.5463386099370533, "learning_rate": 9.788275348441325e-07, "loss": 0.4494, "step": 7850 }, { "epoch": 1.630546955624355, "grad_norm": 0.5924906419908685, "learning_rate": 9.522289605277157e-07, "loss": 0.441, "step": 7900 }, { "epoch": 1.6408668730650153, "grad_norm": 0.5565206258809997, "learning_rate": 9.256303862112992e-07, "loss": 0.4417, "step": 7950 }, { "epoch": 1.651186790505676, "grad_norm": 0.587012594186078, "learning_rate": 8.990318118948826e-07, "loss": 0.4442, "step": 8000 }, { "epoch": 1.6615067079463364, "grad_norm": 0.5771632055191758, "learning_rate": 8.724332375784658e-07, "loss": 0.4456, "step": 8050 }, { "epoch": 1.671826625386997, "grad_norm": 0.56799994088298, "learning_rate": 8.458346632620493e-07, "loss": 0.4486, "step": 8100 }, { "epoch": 1.6821465428276574, "grad_norm": 0.569590541861768, "learning_rate": 8.192360889456326e-07, "loss": 0.449, "step": 8150 }, { "epoch": 1.6924664602683177, "grad_norm": 0.5795380939294089, "learning_rate": 7.926375146292158e-07, "loss": 0.4438, "step": 8200 }, { "epoch": 1.7027863777089784, "grad_norm": 0.5706726979508648, "learning_rate": 7.660389403127994e-07, "loss": 0.4443, "step": 8250 }, { "epoch": 1.7131062951496387, "grad_norm": 0.5791014554749523, "learning_rate": 7.394403659963827e-07, "loss": 0.4481, "step": 8300 }, { "epoch": 1.7234262125902993, "grad_norm": 0.561090943078198, "learning_rate": 7.12841791679966e-07, "loss": 0.4397, "step": 8350 }, { "epoch": 1.7337461300309598, "grad_norm": 0.5863685907601032, "learning_rate": 6.862432173635495e-07, "loss": 0.4445, "step": 8400 }, { "epoch": 1.74406604747162, "grad_norm": 0.5730575809697048, "learning_rate": 6.596446430471328e-07, "loss": 0.4451, "step": 8450 }, { "epoch": 1.7543859649122808, "grad_norm": 0.5635343819200634, "learning_rate": 6.330460687307161e-07, "loss": 0.4382, "step": 8500 }, { "epoch": 1.7647058823529411, "grad_norm": 0.5646651214526662, "learning_rate": 6.064474944142994e-07, "loss": 0.4403, "step": 8550 }, { "epoch": 1.7750257997936016, "grad_norm": 0.5486977071368843, "learning_rate": 5.798489200978829e-07, "loss": 0.4355, "step": 8600 }, { "epoch": 1.7853457172342622, "grad_norm": 0.5568438386062177, "learning_rate": 5.532503457814662e-07, "loss": 0.4477, "step": 8650 }, { "epoch": 1.7956656346749225, "grad_norm": 0.5622396972214505, "learning_rate": 5.266517714650495e-07, "loss": 0.4377, "step": 8700 }, { "epoch": 1.8059855521155832, "grad_norm": 0.552378184120207, "learning_rate": 5.000531971486329e-07, "loss": 0.4467, "step": 8750 }, { "epoch": 1.8163054695562435, "grad_norm": 0.5708548011902037, "learning_rate": 4.734546228322162e-07, "loss": 0.439, "step": 8800 }, { "epoch": 1.826625386996904, "grad_norm": 0.6282302095096289, "learning_rate": 4.468560485157996e-07, "loss": 0.4415, "step": 8850 }, { "epoch": 1.8369453044375645, "grad_norm": 0.5526339563083117, "learning_rate": 4.20257474199383e-07, "loss": 0.4434, "step": 8900 }, { "epoch": 1.8472652218782248, "grad_norm": 0.5627366388182174, "learning_rate": 3.936588998829663e-07, "loss": 0.4406, "step": 8950 }, { "epoch": 1.8575851393188856, "grad_norm": 0.5675273603944594, "learning_rate": 3.6706032556654965e-07, "loss": 0.4364, "step": 9000 }, { "epoch": 1.8679050567595459, "grad_norm": 0.6096156008730134, "learning_rate": 3.40461751250133e-07, "loss": 0.4421, "step": 9050 }, { "epoch": 1.8782249742002064, "grad_norm": 0.655037790258665, "learning_rate": 3.1386317693371635e-07, "loss": 0.4456, "step": 9100 }, { "epoch": 1.888544891640867, "grad_norm": 0.5522478423907845, "learning_rate": 2.8726460261729973e-07, "loss": 0.4399, "step": 9150 }, { "epoch": 1.8988648090815272, "grad_norm": 0.5499766592623824, "learning_rate": 2.606660283008831e-07, "loss": 0.4447, "step": 9200 }, { "epoch": 1.909184726522188, "grad_norm": 0.6047531326573264, "learning_rate": 2.3406745398446645e-07, "loss": 0.4483, "step": 9250 }, { "epoch": 1.9195046439628483, "grad_norm": 0.5307110417886833, "learning_rate": 2.074688796680498e-07, "loss": 0.4474, "step": 9300 }, { "epoch": 1.9298245614035088, "grad_norm": 0.5838188935676187, "learning_rate": 1.8087030535163318e-07, "loss": 0.4453, "step": 9350 }, { "epoch": 1.9401444788441693, "grad_norm": 0.524912113097606, "learning_rate": 1.5427173103521653e-07, "loss": 0.4441, "step": 9400 }, { "epoch": 1.9504643962848296, "grad_norm": 0.5892672773445452, "learning_rate": 1.2767315671879988e-07, "loss": 0.4368, "step": 9450 }, { "epoch": 1.9607843137254903, "grad_norm": 0.5480626100352609, "learning_rate": 1.0107458240238324e-07, "loss": 0.4408, "step": 9500 }, { "epoch": 1.9711042311661506, "grad_norm": 0.5668784462525065, "learning_rate": 7.44760080859666e-08, "loss": 0.4435, "step": 9550 }, { "epoch": 1.9814241486068112, "grad_norm": 0.6110658269716644, "learning_rate": 4.787743376954996e-08, "loss": 0.4431, "step": 9600 }, { "epoch": 1.9917440660474717, "grad_norm": 0.5446976742385811, "learning_rate": 2.127885945313331e-08, "loss": 0.4367, "step": 9650 }, { "epoch": 2.0, "eval_loss": 0.5610442161560059, "eval_runtime": 20.0148, "eval_samples_per_second": 26.88, "eval_steps_per_second": 0.45, "step": 9690 }, { "epoch": 2.0, "step": 9690, "total_flos": 4057777727078400.0, "train_loss": 0.5052211482704485, "train_runtime": 135745.8608, "train_samples_per_second": 4.568, "train_steps_per_second": 0.071 } ], "logging_steps": 50, "max_steps": 9690, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4057777727078400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }