{ "best_metric": 1.300666093826294, "best_model_checkpoint": "4bit_repro_03022025/host10_seed_42_full_det_fp16_no_flash_attn_fix_pad_gemma-2-9b-it-l16-cot-4ep-lr3e04-ws20-bs4-ga4-fp16-11022025/checkpoint-109", "epoch": 3.0, "eval_steps": 500, "global_step": 327, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009195402298850575, "grad_norm": 0.9957932233810425, "learning_rate": 1.4999999999999999e-05, "loss": 2.3724, "step": 1 }, { "epoch": 0.01839080459770115, "grad_norm": 0.7591073513031006, "learning_rate": 2.9999999999999997e-05, "loss": 2.3711, "step": 2 }, { "epoch": 0.027586206896551724, "grad_norm": 0.9023211002349854, "learning_rate": 4.4999999999999996e-05, "loss": 2.2454, "step": 3 }, { "epoch": 0.0367816091954023, "grad_norm": 0.7705220580101013, "learning_rate": 5.9999999999999995e-05, "loss": 2.1252, "step": 4 }, { "epoch": 0.04597701149425287, "grad_norm": 0.8994729518890381, "learning_rate": 7.5e-05, "loss": 2.1776, "step": 5 }, { "epoch": 0.05517241379310345, "grad_norm": 0.8370222449302673, "learning_rate": 8.999999999999999e-05, "loss": 2.0403, "step": 6 }, { "epoch": 0.06436781609195402, "grad_norm": 0.4772511124610901, "learning_rate": 0.00010499999999999999, "loss": 2.0642, "step": 7 }, { "epoch": 0.0735632183908046, "grad_norm": 0.3877703547477722, "learning_rate": 0.00011999999999999999, "loss": 2.0111, "step": 8 }, { "epoch": 0.08275862068965517, "grad_norm": 0.4768696129322052, "learning_rate": 0.000135, "loss": 1.8875, "step": 9 }, { "epoch": 0.09195402298850575, "grad_norm": 0.6014317870140076, "learning_rate": 0.00015, "loss": 1.672, "step": 10 }, { "epoch": 0.10114942528735632, "grad_norm": 0.6302416324615479, "learning_rate": 0.000165, "loss": 1.7088, "step": 11 }, { "epoch": 0.1103448275862069, "grad_norm": 0.5539880990982056, "learning_rate": 0.00017999999999999998, "loss": 1.6913, "step": 12 }, { "epoch": 0.11954022988505747, "grad_norm": 0.6798604130744934, "learning_rate": 0.000195, "loss": 1.4942, "step": 13 }, { "epoch": 0.12873563218390804, "grad_norm": 0.4399753510951996, "learning_rate": 0.00020999999999999998, "loss": 1.6453, "step": 14 }, { "epoch": 0.13793103448275862, "grad_norm": 0.37119659781455994, "learning_rate": 0.000225, "loss": 1.6376, "step": 15 }, { "epoch": 0.1471264367816092, "grad_norm": 0.38230371475219727, "learning_rate": 0.00023999999999999998, "loss": 1.8295, "step": 16 }, { "epoch": 0.15632183908045977, "grad_norm": 0.4516810476779938, "learning_rate": 0.00025499999999999996, "loss": 1.3166, "step": 17 }, { "epoch": 0.16551724137931034, "grad_norm": 0.6360406875610352, "learning_rate": 0.00027, "loss": 1.3089, "step": 18 }, { "epoch": 0.17471264367816092, "grad_norm": 0.3828903138637543, "learning_rate": 0.000285, "loss": 1.3322, "step": 19 }, { "epoch": 0.1839080459770115, "grad_norm": 0.3432200253009796, "learning_rate": 0.0003, "loss": 1.5339, "step": 20 }, { "epoch": 0.19310344827586207, "grad_norm": 0.278246134519577, "learning_rate": 0.00029927184466019415, "loss": 1.6185, "step": 21 }, { "epoch": 0.20229885057471264, "grad_norm": 0.278033047914505, "learning_rate": 0.00029854368932038833, "loss": 1.5488, "step": 22 }, { "epoch": 0.21149425287356322, "grad_norm": 0.35856395959854126, "learning_rate": 0.0002978155339805825, "loss": 1.3512, "step": 23 }, { "epoch": 0.2206896551724138, "grad_norm": 0.3614782989025116, "learning_rate": 0.0002970873786407767, "loss": 1.4487, "step": 24 }, { "epoch": 0.22988505747126436, "grad_norm": 0.30157631635665894, "learning_rate": 0.00029635922330097087, "loss": 1.6049, "step": 25 }, { "epoch": 0.23908045977011494, "grad_norm": 0.35453277826309204, "learning_rate": 0.00029563106796116505, "loss": 1.449, "step": 26 }, { "epoch": 0.2482758620689655, "grad_norm": 0.436924546957016, "learning_rate": 0.0002949029126213592, "loss": 1.3822, "step": 27 }, { "epoch": 0.2574712643678161, "grad_norm": 0.3903788626194, "learning_rate": 0.00029417475728155335, "loss": 1.33, "step": 28 }, { "epoch": 0.26666666666666666, "grad_norm": 0.39776864647865295, "learning_rate": 0.00029344660194174753, "loss": 1.5852, "step": 29 }, { "epoch": 0.27586206896551724, "grad_norm": 0.3369416296482086, "learning_rate": 0.0002927184466019417, "loss": 1.7242, "step": 30 }, { "epoch": 0.2850574712643678, "grad_norm": 0.3887549936771393, "learning_rate": 0.0002919902912621359, "loss": 1.5086, "step": 31 }, { "epoch": 0.2942528735632184, "grad_norm": 0.3567999005317688, "learning_rate": 0.00029126213592233006, "loss": 1.2896, "step": 32 }, { "epoch": 0.30344827586206896, "grad_norm": 0.45827871561050415, "learning_rate": 0.00029053398058252424, "loss": 1.1841, "step": 33 }, { "epoch": 0.31264367816091954, "grad_norm": 0.45487773418426514, "learning_rate": 0.0002898058252427184, "loss": 1.5209, "step": 34 }, { "epoch": 0.3218390804597701, "grad_norm": 0.44364598393440247, "learning_rate": 0.0002890776699029126, "loss": 1.5646, "step": 35 }, { "epoch": 0.3310344827586207, "grad_norm": 0.4502098560333252, "learning_rate": 0.0002883495145631068, "loss": 1.3928, "step": 36 }, { "epoch": 0.34022988505747126, "grad_norm": 0.36199966073036194, "learning_rate": 0.00028762135922330096, "loss": 1.4251, "step": 37 }, { "epoch": 0.34942528735632183, "grad_norm": 0.3491019308567047, "learning_rate": 0.00028689320388349513, "loss": 1.5562, "step": 38 }, { "epoch": 0.3586206896551724, "grad_norm": 0.5400763750076294, "learning_rate": 0.0002861650485436893, "loss": 1.2608, "step": 39 }, { "epoch": 0.367816091954023, "grad_norm": 0.3730115592479706, "learning_rate": 0.0002854368932038835, "loss": 1.5525, "step": 40 }, { "epoch": 0.37701149425287356, "grad_norm": 0.3588751554489136, "learning_rate": 0.00028470873786407767, "loss": 1.3468, "step": 41 }, { "epoch": 0.38620689655172413, "grad_norm": 0.5940146446228027, "learning_rate": 0.00028398058252427185, "loss": 1.185, "step": 42 }, { "epoch": 0.3954022988505747, "grad_norm": 0.394444078207016, "learning_rate": 0.00028325242718446603, "loss": 1.3057, "step": 43 }, { "epoch": 0.4045977011494253, "grad_norm": 0.4431244730949402, "learning_rate": 0.00028252427184466015, "loss": 1.3147, "step": 44 }, { "epoch": 0.41379310344827586, "grad_norm": 0.4967786371707916, "learning_rate": 0.00028179611650485433, "loss": 1.1773, "step": 45 }, { "epoch": 0.42298850574712643, "grad_norm": 0.38681459426879883, "learning_rate": 0.0002810679611650485, "loss": 1.4481, "step": 46 }, { "epoch": 0.432183908045977, "grad_norm": 0.4664541780948639, "learning_rate": 0.0002803398058252427, "loss": 1.3254, "step": 47 }, { "epoch": 0.4413793103448276, "grad_norm": 0.433729350566864, "learning_rate": 0.00027961165048543687, "loss": 1.2067, "step": 48 }, { "epoch": 0.45057471264367815, "grad_norm": 0.4609008729457855, "learning_rate": 0.00027888349514563105, "loss": 1.1925, "step": 49 }, { "epoch": 0.45977011494252873, "grad_norm": 0.5346646904945374, "learning_rate": 0.0002781553398058252, "loss": 1.1117, "step": 50 }, { "epoch": 0.4689655172413793, "grad_norm": 0.4348887801170349, "learning_rate": 0.0002774271844660194, "loss": 1.3088, "step": 51 }, { "epoch": 0.4781609195402299, "grad_norm": 0.4136529266834259, "learning_rate": 0.0002766990291262136, "loss": 1.1332, "step": 52 }, { "epoch": 0.48735632183908045, "grad_norm": 0.580917239189148, "learning_rate": 0.00027597087378640776, "loss": 1.1903, "step": 53 }, { "epoch": 0.496551724137931, "grad_norm": 0.5748546719551086, "learning_rate": 0.00027524271844660194, "loss": 1.0639, "step": 54 }, { "epoch": 0.5057471264367817, "grad_norm": 0.45053598284721375, "learning_rate": 0.0002745145631067961, "loss": 1.2918, "step": 55 }, { "epoch": 0.5149425287356322, "grad_norm": 0.5989317893981934, "learning_rate": 0.0002737864077669903, "loss": 1.2734, "step": 56 }, { "epoch": 0.5241379310344828, "grad_norm": 0.5094353556632996, "learning_rate": 0.0002730582524271845, "loss": 1.063, "step": 57 }, { "epoch": 0.5333333333333333, "grad_norm": 0.5418046712875366, "learning_rate": 0.00027233009708737865, "loss": 1.1931, "step": 58 }, { "epoch": 0.542528735632184, "grad_norm": 0.5898640155792236, "learning_rate": 0.0002716019417475728, "loss": 1.1561, "step": 59 }, { "epoch": 0.5517241379310345, "grad_norm": 0.5417141914367676, "learning_rate": 0.00027087378640776696, "loss": 1.1772, "step": 60 }, { "epoch": 0.5609195402298851, "grad_norm": 0.5419390201568604, "learning_rate": 0.00027014563106796114, "loss": 1.1019, "step": 61 }, { "epoch": 0.5701149425287356, "grad_norm": 0.5117617845535278, "learning_rate": 0.0002694174757281553, "loss": 1.026, "step": 62 }, { "epoch": 0.5793103448275863, "grad_norm": 0.6913059949874878, "learning_rate": 0.0002686893203883495, "loss": 1.0382, "step": 63 }, { "epoch": 0.5885057471264368, "grad_norm": 0.7260013818740845, "learning_rate": 0.00026796116504854367, "loss": 1.0667, "step": 64 }, { "epoch": 0.5977011494252874, "grad_norm": 0.831520676612854, "learning_rate": 0.00026723300970873785, "loss": 1.2295, "step": 65 }, { "epoch": 0.6068965517241379, "grad_norm": 0.5325063467025757, "learning_rate": 0.00026650485436893203, "loss": 0.8899, "step": 66 }, { "epoch": 0.6160919540229886, "grad_norm": 0.5867132544517517, "learning_rate": 0.0002657766990291262, "loss": 1.1095, "step": 67 }, { "epoch": 0.6252873563218391, "grad_norm": 0.6068809628486633, "learning_rate": 0.0002650485436893204, "loss": 1.075, "step": 68 }, { "epoch": 0.6344827586206897, "grad_norm": 0.6909754872322083, "learning_rate": 0.00026432038834951456, "loss": 1.1673, "step": 69 }, { "epoch": 0.6436781609195402, "grad_norm": 0.7632415294647217, "learning_rate": 0.00026359223300970874, "loss": 1.1162, "step": 70 }, { "epoch": 0.6528735632183909, "grad_norm": 0.6888180375099182, "learning_rate": 0.0002628640776699029, "loss": 0.8784, "step": 71 }, { "epoch": 0.6620689655172414, "grad_norm": 0.6300679445266724, "learning_rate": 0.00026213592233009705, "loss": 0.8871, "step": 72 }, { "epoch": 0.671264367816092, "grad_norm": 0.6151922941207886, "learning_rate": 0.0002614077669902912, "loss": 0.9701, "step": 73 }, { "epoch": 0.6804597701149425, "grad_norm": 0.6928962469100952, "learning_rate": 0.0002606796116504854, "loss": 1.0456, "step": 74 }, { "epoch": 0.6896551724137931, "grad_norm": 0.848274827003479, "learning_rate": 0.0002599514563106796, "loss": 1.1089, "step": 75 }, { "epoch": 0.6988505747126437, "grad_norm": 1.1162086725234985, "learning_rate": 0.00025922330097087376, "loss": 1.1541, "step": 76 }, { "epoch": 0.7080459770114943, "grad_norm": 0.6158594489097595, "learning_rate": 0.00025849514563106794, "loss": 1.1431, "step": 77 }, { "epoch": 0.7172413793103448, "grad_norm": 0.6790457367897034, "learning_rate": 0.0002577669902912621, "loss": 0.9188, "step": 78 }, { "epoch": 0.7264367816091954, "grad_norm": 0.844353973865509, "learning_rate": 0.0002570388349514563, "loss": 1.2385, "step": 79 }, { "epoch": 0.735632183908046, "grad_norm": 0.645825207233429, "learning_rate": 0.0002563106796116505, "loss": 1.0516, "step": 80 }, { "epoch": 0.7448275862068966, "grad_norm": 0.5313035249710083, "learning_rate": 0.0002555825242718446, "loss": 1.046, "step": 81 }, { "epoch": 0.7540229885057471, "grad_norm": 0.6045447587966919, "learning_rate": 0.0002548543689320388, "loss": 0.9456, "step": 82 }, { "epoch": 0.7632183908045977, "grad_norm": 1.1198372840881348, "learning_rate": 0.00025412621359223296, "loss": 1.1684, "step": 83 }, { "epoch": 0.7724137931034483, "grad_norm": 0.6908996105194092, "learning_rate": 0.00025339805825242714, "loss": 1.1155, "step": 84 }, { "epoch": 0.7816091954022989, "grad_norm": 0.5916021466255188, "learning_rate": 0.0002526699029126213, "loss": 1.0781, "step": 85 }, { "epoch": 0.7908045977011494, "grad_norm": 0.8320909738540649, "learning_rate": 0.0002519417475728155, "loss": 0.7954, "step": 86 }, { "epoch": 0.8, "grad_norm": 0.6855669617652893, "learning_rate": 0.00025121359223300967, "loss": 0.8968, "step": 87 }, { "epoch": 0.8091954022988506, "grad_norm": 0.6343971490859985, "learning_rate": 0.00025048543689320385, "loss": 0.9878, "step": 88 }, { "epoch": 0.8183908045977012, "grad_norm": 0.5901280641555786, "learning_rate": 0.00024975728155339803, "loss": 1.0158, "step": 89 }, { "epoch": 0.8275862068965517, "grad_norm": 0.5881906151771545, "learning_rate": 0.0002490291262135922, "loss": 0.9432, "step": 90 }, { "epoch": 0.8367816091954023, "grad_norm": 0.749911367893219, "learning_rate": 0.0002483009708737864, "loss": 0.8734, "step": 91 }, { "epoch": 0.8459770114942529, "grad_norm": 0.6801809668540955, "learning_rate": 0.00024757281553398056, "loss": 0.7291, "step": 92 }, { "epoch": 0.8551724137931035, "grad_norm": 0.5692021250724792, "learning_rate": 0.00024684466019417474, "loss": 0.976, "step": 93 }, { "epoch": 0.864367816091954, "grad_norm": 0.8974186182022095, "learning_rate": 0.0002461165048543689, "loss": 0.8702, "step": 94 }, { "epoch": 0.8735632183908046, "grad_norm": 0.7381451725959778, "learning_rate": 0.0002453883495145631, "loss": 0.832, "step": 95 }, { "epoch": 0.8827586206896552, "grad_norm": 0.5254162549972534, "learning_rate": 0.0002446601941747572, "loss": 0.7984, "step": 96 }, { "epoch": 0.8919540229885058, "grad_norm": 0.6398109197616577, "learning_rate": 0.00024393203883495143, "loss": 0.9934, "step": 97 }, { "epoch": 0.9011494252873563, "grad_norm": 0.6260827779769897, "learning_rate": 0.0002432038834951456, "loss": 1.049, "step": 98 }, { "epoch": 0.9103448275862069, "grad_norm": 0.7606930136680603, "learning_rate": 0.0002424757281553398, "loss": 0.8291, "step": 99 }, { "epoch": 0.9195402298850575, "grad_norm": 0.5728839039802551, "learning_rate": 0.00024174757281553394, "loss": 0.9048, "step": 100 }, { "epoch": 0.9287356321839081, "grad_norm": 0.8986696600914001, "learning_rate": 0.00024101941747572812, "loss": 0.8363, "step": 101 }, { "epoch": 0.9379310344827586, "grad_norm": 1.1614326238632202, "learning_rate": 0.0002402912621359223, "loss": 0.8226, "step": 102 }, { "epoch": 0.9471264367816092, "grad_norm": 0.6037288904190063, "learning_rate": 0.00023956310679611648, "loss": 0.9164, "step": 103 }, { "epoch": 0.9563218390804598, "grad_norm": 0.5823872685432434, "learning_rate": 0.00023883495145631065, "loss": 0.9207, "step": 104 }, { "epoch": 0.9655172413793104, "grad_norm": 0.6235625147819519, "learning_rate": 0.00023810679611650483, "loss": 0.762, "step": 105 }, { "epoch": 0.9747126436781609, "grad_norm": 0.5390623807907104, "learning_rate": 0.000237378640776699, "loss": 1.0991, "step": 106 }, { "epoch": 0.9839080459770115, "grad_norm": 0.5815601944923401, "learning_rate": 0.0002366504854368932, "loss": 0.9363, "step": 107 }, { "epoch": 0.993103448275862, "grad_norm": 0.5816079378128052, "learning_rate": 0.00023592233009708734, "loss": 0.7259, "step": 108 }, { "epoch": 1.0, "grad_norm": 2.0381648540496826, "learning_rate": 0.00023519417475728152, "loss": 0.8426, "step": 109 }, { "epoch": 1.0, "eval_loss": 1.300666093826294, "eval_runtime": 94.5929, "eval_samples_per_second": 3.51, "eval_steps_per_second": 1.755, "step": 109 }, { "epoch": 1.0091954022988505, "grad_norm": 0.5967744588851929, "learning_rate": 0.0002344660194174757, "loss": 0.7901, "step": 110 }, { "epoch": 1.018390804597701, "grad_norm": 0.5640788078308105, "learning_rate": 0.00023373786407766988, "loss": 0.6748, "step": 111 }, { "epoch": 1.0275862068965518, "grad_norm": 0.5988382697105408, "learning_rate": 0.00023300970873786406, "loss": 0.7256, "step": 112 }, { "epoch": 1.0367816091954023, "grad_norm": 0.8283673524856567, "learning_rate": 0.00023228155339805823, "loss": 0.5817, "step": 113 }, { "epoch": 1.0459770114942528, "grad_norm": 0.5982439517974854, "learning_rate": 0.0002315533980582524, "loss": 0.8342, "step": 114 }, { "epoch": 1.0551724137931036, "grad_norm": 0.9253256916999817, "learning_rate": 0.0002308252427184466, "loss": 0.6173, "step": 115 }, { "epoch": 1.064367816091954, "grad_norm": 0.58003830909729, "learning_rate": 0.00023009708737864074, "loss": 0.6153, "step": 116 }, { "epoch": 1.0735632183908046, "grad_norm": 0.9149603843688965, "learning_rate": 0.00022936893203883492, "loss": 0.7499, "step": 117 }, { "epoch": 1.0827586206896551, "grad_norm": 0.4855748414993286, "learning_rate": 0.0002286407766990291, "loss": 0.8476, "step": 118 }, { "epoch": 1.0919540229885056, "grad_norm": 0.5189658403396606, "learning_rate": 0.00022791262135922328, "loss": 0.6332, "step": 119 }, { "epoch": 1.1011494252873564, "grad_norm": 0.6228414177894592, "learning_rate": 0.00022718446601941746, "loss": 0.6543, "step": 120 }, { "epoch": 1.110344827586207, "grad_norm": 0.7163541913032532, "learning_rate": 0.00022645631067961164, "loss": 0.5062, "step": 121 }, { "epoch": 1.1195402298850574, "grad_norm": 0.6037282347679138, "learning_rate": 0.00022572815533980582, "loss": 0.4752, "step": 122 }, { "epoch": 1.1287356321839082, "grad_norm": 0.5334956049919128, "learning_rate": 0.000225, "loss": 0.5978, "step": 123 }, { "epoch": 1.1379310344827587, "grad_norm": 0.6316633224487305, "learning_rate": 0.00022427184466019415, "loss": 0.631, "step": 124 }, { "epoch": 1.1471264367816092, "grad_norm": 0.5869876742362976, "learning_rate": 0.00022354368932038832, "loss": 0.6083, "step": 125 }, { "epoch": 1.1563218390804597, "grad_norm": 0.8843062520027161, "learning_rate": 0.0002228155339805825, "loss": 0.5106, "step": 126 }, { "epoch": 1.1655172413793102, "grad_norm": 0.9645463228225708, "learning_rate": 0.00022208737864077668, "loss": 0.6352, "step": 127 }, { "epoch": 1.174712643678161, "grad_norm": 0.4931183457374573, "learning_rate": 0.00022135922330097086, "loss": 0.6054, "step": 128 }, { "epoch": 1.1839080459770115, "grad_norm": 0.5451965928077698, "learning_rate": 0.00022063106796116504, "loss": 0.6165, "step": 129 }, { "epoch": 1.193103448275862, "grad_norm": 0.44423708319664, "learning_rate": 0.00021990291262135922, "loss": 0.57, "step": 130 }, { "epoch": 1.2022988505747128, "grad_norm": 0.6561571955680847, "learning_rate": 0.00021917475728155337, "loss": 0.5627, "step": 131 }, { "epoch": 1.2114942528735633, "grad_norm": 0.4954622983932495, "learning_rate": 0.00021844660194174755, "loss": 0.6054, "step": 132 }, { "epoch": 1.2206896551724138, "grad_norm": 0.49996840953826904, "learning_rate": 0.00021771844660194173, "loss": 0.7044, "step": 133 }, { "epoch": 1.2298850574712643, "grad_norm": 0.6132878065109253, "learning_rate": 0.0002169902912621359, "loss": 0.5854, "step": 134 }, { "epoch": 1.2390804597701148, "grad_norm": 0.619061291217804, "learning_rate": 0.00021626213592233008, "loss": 0.7435, "step": 135 }, { "epoch": 1.2482758620689656, "grad_norm": 0.5794959664344788, "learning_rate": 0.00021553398058252426, "loss": 0.8223, "step": 136 }, { "epoch": 1.257471264367816, "grad_norm": 0.595934271812439, "learning_rate": 0.00021480582524271844, "loss": 0.4924, "step": 137 }, { "epoch": 1.2666666666666666, "grad_norm": 0.7381154298782349, "learning_rate": 0.00021407766990291262, "loss": 0.5804, "step": 138 }, { "epoch": 1.2758620689655173, "grad_norm": 0.6467525362968445, "learning_rate": 0.00021334951456310677, "loss": 0.6435, "step": 139 }, { "epoch": 1.2850574712643679, "grad_norm": 0.5511316657066345, "learning_rate": 0.00021262135922330095, "loss": 0.6627, "step": 140 }, { "epoch": 1.2942528735632184, "grad_norm": 0.6253796815872192, "learning_rate": 0.00021189320388349513, "loss": 0.7729, "step": 141 }, { "epoch": 1.303448275862069, "grad_norm": 0.4994090795516968, "learning_rate": 0.0002111650485436893, "loss": 0.4122, "step": 142 }, { "epoch": 1.3126436781609194, "grad_norm": 0.35405322909355164, "learning_rate": 0.00021043689320388349, "loss": 0.3711, "step": 143 }, { "epoch": 1.3218390804597702, "grad_norm": 0.5248880982398987, "learning_rate": 0.00020970873786407766, "loss": 0.6734, "step": 144 }, { "epoch": 1.3310344827586207, "grad_norm": 0.44895875453948975, "learning_rate": 0.00020898058252427184, "loss": 0.3438, "step": 145 }, { "epoch": 1.3402298850574712, "grad_norm": 0.4654625654220581, "learning_rate": 0.00020825242718446602, "loss": 0.4324, "step": 146 }, { "epoch": 1.349425287356322, "grad_norm": 0.4388936460018158, "learning_rate": 0.00020752427184466017, "loss": 0.4578, "step": 147 }, { "epoch": 1.3586206896551725, "grad_norm": 0.4960116744041443, "learning_rate": 0.00020679611650485435, "loss": 0.5942, "step": 148 }, { "epoch": 1.367816091954023, "grad_norm": 0.590185284614563, "learning_rate": 0.00020606796116504853, "loss": 0.4179, "step": 149 }, { "epoch": 1.3770114942528735, "grad_norm": 0.5173139572143555, "learning_rate": 0.0002053398058252427, "loss": 0.5081, "step": 150 }, { "epoch": 1.386206896551724, "grad_norm": 0.5537795424461365, "learning_rate": 0.0002046116504854369, "loss": 0.6486, "step": 151 }, { "epoch": 1.3954022988505748, "grad_norm": 0.593481183052063, "learning_rate": 0.00020388349514563107, "loss": 0.5944, "step": 152 }, { "epoch": 1.4045977011494253, "grad_norm": 0.5522420406341553, "learning_rate": 0.00020315533980582524, "loss": 0.8094, "step": 153 }, { "epoch": 1.4137931034482758, "grad_norm": 0.6627272963523865, "learning_rate": 0.00020242718446601942, "loss": 0.4802, "step": 154 }, { "epoch": 1.4229885057471265, "grad_norm": 0.3878915011882782, "learning_rate": 0.00020169902912621357, "loss": 0.4832, "step": 155 }, { "epoch": 1.432183908045977, "grad_norm": 0.5121778845787048, "learning_rate": 0.00020097087378640775, "loss": 0.6316, "step": 156 }, { "epoch": 1.4413793103448276, "grad_norm": 0.41354355216026306, "learning_rate": 0.00020024271844660193, "loss": 0.5819, "step": 157 }, { "epoch": 1.450574712643678, "grad_norm": 0.35058164596557617, "learning_rate": 0.0001995145631067961, "loss": 0.4422, "step": 158 }, { "epoch": 1.4597701149425286, "grad_norm": 0.5187242031097412, "learning_rate": 0.0001987864077669903, "loss": 0.5037, "step": 159 }, { "epoch": 1.4689655172413794, "grad_norm": 0.7884855270385742, "learning_rate": 0.00019805825242718447, "loss": 0.687, "step": 160 }, { "epoch": 1.4781609195402299, "grad_norm": 0.6305707693099976, "learning_rate": 0.00019733009708737865, "loss": 0.6576, "step": 161 }, { "epoch": 1.4873563218390804, "grad_norm": 0.5323311686515808, "learning_rate": 0.0001966019417475728, "loss": 0.5083, "step": 162 }, { "epoch": 1.4965517241379311, "grad_norm": 0.4653192162513733, "learning_rate": 0.00019587378640776698, "loss": 0.4232, "step": 163 }, { "epoch": 1.5057471264367817, "grad_norm": 0.42282363772392273, "learning_rate": 0.00019514563106796116, "loss": 0.46, "step": 164 }, { "epoch": 1.5149425287356322, "grad_norm": 0.5382373929023743, "learning_rate": 0.00019441747572815533, "loss": 0.6501, "step": 165 }, { "epoch": 1.524137931034483, "grad_norm": 0.49478620290756226, "learning_rate": 0.0001936893203883495, "loss": 0.6337, "step": 166 }, { "epoch": 1.5333333333333332, "grad_norm": 0.3653823435306549, "learning_rate": 0.0001929611650485437, "loss": 0.5887, "step": 167 }, { "epoch": 1.542528735632184, "grad_norm": 0.8347523808479309, "learning_rate": 0.00019223300970873787, "loss": 0.5192, "step": 168 }, { "epoch": 1.5517241379310345, "grad_norm": 0.559551477432251, "learning_rate": 0.00019150485436893205, "loss": 0.3474, "step": 169 }, { "epoch": 1.560919540229885, "grad_norm": 0.42149192094802856, "learning_rate": 0.0001907766990291262, "loss": 0.407, "step": 170 }, { "epoch": 1.5701149425287357, "grad_norm": 0.5364254117012024, "learning_rate": 0.00019004854368932038, "loss": 0.7923, "step": 171 }, { "epoch": 1.5793103448275863, "grad_norm": 0.5137253999710083, "learning_rate": 0.00018932038834951456, "loss": 0.5499, "step": 172 }, { "epoch": 1.5885057471264368, "grad_norm": 0.5608237981796265, "learning_rate": 0.00018859223300970874, "loss": 0.5724, "step": 173 }, { "epoch": 1.5977011494252875, "grad_norm": 0.34653526544570923, "learning_rate": 0.00018786407766990291, "loss": 0.3985, "step": 174 }, { "epoch": 1.6068965517241378, "grad_norm": 0.6667991876602173, "learning_rate": 0.00018713592233009707, "loss": 0.6257, "step": 175 }, { "epoch": 1.6160919540229886, "grad_norm": 0.5803218483924866, "learning_rate": 0.00018640776699029122, "loss": 0.7256, "step": 176 }, { "epoch": 1.625287356321839, "grad_norm": 0.3350389897823334, "learning_rate": 0.0001856796116504854, "loss": 0.3256, "step": 177 }, { "epoch": 1.6344827586206896, "grad_norm": 0.2988188862800598, "learning_rate": 0.00018495145631067957, "loss": 0.325, "step": 178 }, { "epoch": 1.6436781609195403, "grad_norm": 0.540225088596344, "learning_rate": 0.00018422330097087375, "loss": 0.7405, "step": 179 }, { "epoch": 1.6528735632183909, "grad_norm": 0.46998921036720276, "learning_rate": 0.00018349514563106793, "loss": 0.5965, "step": 180 }, { "epoch": 1.6620689655172414, "grad_norm": 0.42614537477493286, "learning_rate": 0.0001827669902912621, "loss": 0.5187, "step": 181 }, { "epoch": 1.6712643678160921, "grad_norm": 0.5146321058273315, "learning_rate": 0.0001820388349514563, "loss": 0.6134, "step": 182 }, { "epoch": 1.6804597701149424, "grad_norm": 0.5366286635398865, "learning_rate": 0.00018131067961165047, "loss": 0.4942, "step": 183 }, { "epoch": 1.6896551724137931, "grad_norm": 0.5794548392295837, "learning_rate": 0.00018058252427184462, "loss": 0.6949, "step": 184 }, { "epoch": 1.6988505747126437, "grad_norm": 0.4413406550884247, "learning_rate": 0.0001798543689320388, "loss": 0.5908, "step": 185 }, { "epoch": 1.7080459770114942, "grad_norm": 0.5551508069038391, "learning_rate": 0.00017912621359223298, "loss": 0.4899, "step": 186 }, { "epoch": 1.717241379310345, "grad_norm": 0.5020127892494202, "learning_rate": 0.00017839805825242716, "loss": 0.3787, "step": 187 }, { "epoch": 1.7264367816091954, "grad_norm": 0.5126776695251465, "learning_rate": 0.00017766990291262133, "loss": 0.4694, "step": 188 }, { "epoch": 1.735632183908046, "grad_norm": 0.5620916485786438, "learning_rate": 0.0001769417475728155, "loss": 0.5547, "step": 189 }, { "epoch": 1.7448275862068967, "grad_norm": 0.4924725294113159, "learning_rate": 0.0001762135922330097, "loss": 0.5656, "step": 190 }, { "epoch": 1.754022988505747, "grad_norm": 0.3647457957267761, "learning_rate": 0.00017548543689320387, "loss": 0.5137, "step": 191 }, { "epoch": 1.7632183908045977, "grad_norm": 0.4956059455871582, "learning_rate": 0.00017475728155339802, "loss": 0.6375, "step": 192 }, { "epoch": 1.7724137931034483, "grad_norm": 0.3766675591468811, "learning_rate": 0.0001740291262135922, "loss": 0.5081, "step": 193 }, { "epoch": 1.7816091954022988, "grad_norm": 0.3699629604816437, "learning_rate": 0.00017330097087378638, "loss": 0.3969, "step": 194 }, { "epoch": 1.7908045977011495, "grad_norm": 0.35047248005867004, "learning_rate": 0.00017257281553398056, "loss": 0.4473, "step": 195 }, { "epoch": 1.8, "grad_norm": 0.5037795901298523, "learning_rate": 0.00017184466019417474, "loss": 0.4882, "step": 196 }, { "epoch": 1.8091954022988506, "grad_norm": 0.46976998448371887, "learning_rate": 0.00017111650485436891, "loss": 0.5307, "step": 197 }, { "epoch": 1.8183908045977013, "grad_norm": 0.5248985290527344, "learning_rate": 0.0001703883495145631, "loss": 0.5062, "step": 198 }, { "epoch": 1.8275862068965516, "grad_norm": 0.4395950436592102, "learning_rate": 0.00016966019417475724, "loss": 0.5184, "step": 199 }, { "epoch": 1.8367816091954023, "grad_norm": 0.31456464529037476, "learning_rate": 0.00016893203883495142, "loss": 0.3707, "step": 200 }, { "epoch": 1.8459770114942529, "grad_norm": 0.39423462748527527, "learning_rate": 0.0001682038834951456, "loss": 0.3915, "step": 201 }, { "epoch": 1.8551724137931034, "grad_norm": 0.3807072937488556, "learning_rate": 0.00016747572815533978, "loss": 0.4203, "step": 202 }, { "epoch": 1.8643678160919541, "grad_norm": 0.49600493907928467, "learning_rate": 0.00016674757281553396, "loss": 0.5633, "step": 203 }, { "epoch": 1.8735632183908046, "grad_norm": 0.4472233057022095, "learning_rate": 0.00016601941747572814, "loss": 0.5791, "step": 204 }, { "epoch": 1.8827586206896552, "grad_norm": 0.48306939005851746, "learning_rate": 0.00016529126213592232, "loss": 0.4091, "step": 205 }, { "epoch": 1.891954022988506, "grad_norm": 0.40710389614105225, "learning_rate": 0.0001645631067961165, "loss": 0.5302, "step": 206 }, { "epoch": 1.9011494252873562, "grad_norm": 0.37706899642944336, "learning_rate": 0.00016383495145631065, "loss": 0.416, "step": 207 }, { "epoch": 1.910344827586207, "grad_norm": 0.5805254578590393, "learning_rate": 0.00016310679611650483, "loss": 0.6861, "step": 208 }, { "epoch": 1.9195402298850575, "grad_norm": 0.44542425870895386, "learning_rate": 0.000162378640776699, "loss": 0.5468, "step": 209 }, { "epoch": 1.928735632183908, "grad_norm": 0.44782838225364685, "learning_rate": 0.00016165048543689318, "loss": 0.4827, "step": 210 }, { "epoch": 1.9379310344827587, "grad_norm": 0.30875957012176514, "learning_rate": 0.00016092233009708736, "loss": 0.3472, "step": 211 }, { "epoch": 1.9471264367816092, "grad_norm": 0.4443942904472351, "learning_rate": 0.00016019417475728154, "loss": 0.5595, "step": 212 }, { "epoch": 1.9563218390804598, "grad_norm": 0.32429659366607666, "learning_rate": 0.00015946601941747572, "loss": 0.374, "step": 213 }, { "epoch": 1.9655172413793105, "grad_norm": 0.339242160320282, "learning_rate": 0.0001587378640776699, "loss": 0.4155, "step": 214 }, { "epoch": 1.9747126436781608, "grad_norm": 0.5646737813949585, "learning_rate": 0.00015800970873786405, "loss": 0.8037, "step": 215 }, { "epoch": 1.9839080459770115, "grad_norm": 0.5335783958435059, "learning_rate": 0.00015728155339805823, "loss": 0.7524, "step": 216 }, { "epoch": 1.993103448275862, "grad_norm": 0.46629971265792847, "learning_rate": 0.0001565533980582524, "loss": 0.598, "step": 217 }, { "epoch": 2.0, "grad_norm": 0.7031301856040955, "learning_rate": 0.00015582524271844658, "loss": 0.7793, "step": 218 }, { "epoch": 2.0, "eval_loss": 1.3836089372634888, "eval_runtime": 94.5395, "eval_samples_per_second": 3.512, "eval_steps_per_second": 1.756, "step": 218 }, { "epoch": 2.0091954022988507, "grad_norm": 0.3298664093017578, "learning_rate": 0.00015509708737864076, "loss": 0.3196, "step": 219 }, { "epoch": 2.018390804597701, "grad_norm": 0.4151831567287445, "learning_rate": 0.00015436893203883494, "loss": 0.4209, "step": 220 }, { "epoch": 2.027586206896552, "grad_norm": 0.35617533326148987, "learning_rate": 0.00015364077669902912, "loss": 0.4023, "step": 221 }, { "epoch": 2.036781609195402, "grad_norm": 0.41707131266593933, "learning_rate": 0.0001529126213592233, "loss": 0.4355, "step": 222 }, { "epoch": 2.045977011494253, "grad_norm": 0.3739112913608551, "learning_rate": 0.00015218446601941745, "loss": 0.3309, "step": 223 }, { "epoch": 2.0551724137931036, "grad_norm": 0.529603123664856, "learning_rate": 0.00015145631067961163, "loss": 0.3706, "step": 224 }, { "epoch": 2.064367816091954, "grad_norm": 0.4731467664241791, "learning_rate": 0.0001507281553398058, "loss": 0.379, "step": 225 }, { "epoch": 2.0735632183908046, "grad_norm": 0.5218953490257263, "learning_rate": 0.00015, "loss": 0.3467, "step": 226 }, { "epoch": 2.0827586206896553, "grad_norm": 0.7507036924362183, "learning_rate": 0.00014927184466019417, "loss": 0.6107, "step": 227 }, { "epoch": 2.0919540229885056, "grad_norm": 0.44099223613739014, "learning_rate": 0.00014854368932038834, "loss": 0.3186, "step": 228 }, { "epoch": 2.1011494252873564, "grad_norm": 0.4234969913959503, "learning_rate": 0.00014781553398058252, "loss": 0.3482, "step": 229 }, { "epoch": 2.110344827586207, "grad_norm": 0.49509719014167786, "learning_rate": 0.00014708737864077667, "loss": 0.3985, "step": 230 }, { "epoch": 2.1195402298850574, "grad_norm": 0.43987375497817993, "learning_rate": 0.00014635922330097085, "loss": 0.3618, "step": 231 }, { "epoch": 2.128735632183908, "grad_norm": 0.6162170767784119, "learning_rate": 0.00014563106796116503, "loss": 0.4423, "step": 232 }, { "epoch": 2.1379310344827585, "grad_norm": 0.3867994248867035, "learning_rate": 0.0001449029126213592, "loss": 0.3419, "step": 233 }, { "epoch": 2.147126436781609, "grad_norm": 0.7177544832229614, "learning_rate": 0.0001441747572815534, "loss": 0.4747, "step": 234 }, { "epoch": 2.15632183908046, "grad_norm": 0.5735111236572266, "learning_rate": 0.00014344660194174757, "loss": 0.4305, "step": 235 }, { "epoch": 2.1655172413793102, "grad_norm": 0.5527003407478333, "learning_rate": 0.00014271844660194175, "loss": 0.3601, "step": 236 }, { "epoch": 2.174712643678161, "grad_norm": 0.5799367427825928, "learning_rate": 0.00014199029126213592, "loss": 0.4681, "step": 237 }, { "epoch": 2.1839080459770113, "grad_norm": 0.5536375045776367, "learning_rate": 0.00014126213592233008, "loss": 0.4431, "step": 238 }, { "epoch": 2.193103448275862, "grad_norm": 0.45677146315574646, "learning_rate": 0.00014053398058252425, "loss": 0.3618, "step": 239 }, { "epoch": 2.2022988505747128, "grad_norm": 0.464231938123703, "learning_rate": 0.00013980582524271843, "loss": 0.3164, "step": 240 }, { "epoch": 2.211494252873563, "grad_norm": 0.7074082493782043, "learning_rate": 0.0001390776699029126, "loss": 0.5988, "step": 241 }, { "epoch": 2.220689655172414, "grad_norm": 0.4413163363933563, "learning_rate": 0.0001383495145631068, "loss": 0.2946, "step": 242 }, { "epoch": 2.2298850574712645, "grad_norm": 0.3606387972831726, "learning_rate": 0.00013762135922330097, "loss": 0.2709, "step": 243 }, { "epoch": 2.239080459770115, "grad_norm": 0.6036794185638428, "learning_rate": 0.00013689320388349515, "loss": 0.5043, "step": 244 }, { "epoch": 2.2482758620689656, "grad_norm": 0.7041867971420288, "learning_rate": 0.00013616504854368933, "loss": 0.6161, "step": 245 }, { "epoch": 2.2574712643678163, "grad_norm": 0.36813509464263916, "learning_rate": 0.00013543689320388348, "loss": 0.2538, "step": 246 }, { "epoch": 2.2666666666666666, "grad_norm": 0.6574187874794006, "learning_rate": 0.00013470873786407766, "loss": 0.5472, "step": 247 }, { "epoch": 2.2758620689655173, "grad_norm": 0.3584626317024231, "learning_rate": 0.00013398058252427184, "loss": 0.2486, "step": 248 }, { "epoch": 2.2850574712643676, "grad_norm": 0.7730153203010559, "learning_rate": 0.00013325242718446601, "loss": 0.4867, "step": 249 }, { "epoch": 2.2942528735632184, "grad_norm": 0.6601793766021729, "learning_rate": 0.0001325242718446602, "loss": 0.6158, "step": 250 }, { "epoch": 2.303448275862069, "grad_norm": 0.42591384053230286, "learning_rate": 0.00013179611650485437, "loss": 0.3484, "step": 251 }, { "epoch": 2.3126436781609194, "grad_norm": 0.3791221082210541, "learning_rate": 0.00013106796116504852, "loss": 0.2861, "step": 252 }, { "epoch": 2.32183908045977, "grad_norm": 0.5107592940330505, "learning_rate": 0.0001303398058252427, "loss": 0.4219, "step": 253 }, { "epoch": 2.3310344827586205, "grad_norm": 0.43515774607658386, "learning_rate": 0.00012961165048543688, "loss": 0.32, "step": 254 }, { "epoch": 2.340229885057471, "grad_norm": 0.45249196887016296, "learning_rate": 0.00012888349514563106, "loss": 0.3144, "step": 255 }, { "epoch": 2.349425287356322, "grad_norm": 0.5824540853500366, "learning_rate": 0.00012815533980582524, "loss": 0.4423, "step": 256 }, { "epoch": 2.3586206896551722, "grad_norm": 0.680497407913208, "learning_rate": 0.0001274271844660194, "loss": 0.579, "step": 257 }, { "epoch": 2.367816091954023, "grad_norm": 0.45555397868156433, "learning_rate": 0.00012669902912621357, "loss": 0.3296, "step": 258 }, { "epoch": 2.3770114942528737, "grad_norm": 0.5558891892433167, "learning_rate": 0.00012597087378640775, "loss": 0.4183, "step": 259 }, { "epoch": 2.386206896551724, "grad_norm": 0.6155978441238403, "learning_rate": 0.00012524271844660192, "loss": 0.4989, "step": 260 }, { "epoch": 2.3954022988505748, "grad_norm": 0.5432369709014893, "learning_rate": 0.0001245145631067961, "loss": 0.4149, "step": 261 }, { "epoch": 2.4045977011494255, "grad_norm": 0.3622282147407532, "learning_rate": 0.00012378640776699028, "loss": 0.2948, "step": 262 }, { "epoch": 2.413793103448276, "grad_norm": 0.42852282524108887, "learning_rate": 0.00012305825242718446, "loss": 0.2617, "step": 263 }, { "epoch": 2.4229885057471265, "grad_norm": 0.3979598879814148, "learning_rate": 0.0001223300970873786, "loss": 0.221, "step": 264 }, { "epoch": 2.432183908045977, "grad_norm": 0.4943128526210785, "learning_rate": 0.0001216019417475728, "loss": 0.31, "step": 265 }, { "epoch": 2.4413793103448276, "grad_norm": 0.5944445133209229, "learning_rate": 0.00012087378640776697, "loss": 0.5284, "step": 266 }, { "epoch": 2.4505747126436783, "grad_norm": 0.5441230535507202, "learning_rate": 0.00012014563106796115, "loss": 0.4091, "step": 267 }, { "epoch": 2.4597701149425286, "grad_norm": 0.3592386543750763, "learning_rate": 0.00011941747572815533, "loss": 0.2881, "step": 268 }, { "epoch": 2.4689655172413794, "grad_norm": 0.4022001028060913, "learning_rate": 0.0001186893203883495, "loss": 0.3005, "step": 269 }, { "epoch": 2.4781609195402297, "grad_norm": 0.3709545433521271, "learning_rate": 0.00011796116504854367, "loss": 0.261, "step": 270 }, { "epoch": 2.4873563218390804, "grad_norm": 0.4119846522808075, "learning_rate": 0.00011723300970873785, "loss": 0.284, "step": 271 }, { "epoch": 2.496551724137931, "grad_norm": 0.5992346405982971, "learning_rate": 0.00011650485436893203, "loss": 0.6485, "step": 272 }, { "epoch": 2.5057471264367814, "grad_norm": 0.3555474579334259, "learning_rate": 0.0001157766990291262, "loss": 0.2032, "step": 273 }, { "epoch": 2.514942528735632, "grad_norm": 0.40004169940948486, "learning_rate": 0.00011504854368932037, "loss": 0.3559, "step": 274 }, { "epoch": 2.524137931034483, "grad_norm": 0.5196751356124878, "learning_rate": 0.00011432038834951455, "loss": 0.3952, "step": 275 }, { "epoch": 2.533333333333333, "grad_norm": 0.2508908808231354, "learning_rate": 0.00011359223300970873, "loss": 0.1867, "step": 276 }, { "epoch": 2.542528735632184, "grad_norm": 0.5024006366729736, "learning_rate": 0.00011286407766990291, "loss": 0.4146, "step": 277 }, { "epoch": 2.5517241379310347, "grad_norm": 0.48392558097839355, "learning_rate": 0.00011213592233009707, "loss": 0.404, "step": 278 }, { "epoch": 2.560919540229885, "grad_norm": 0.3421654999256134, "learning_rate": 0.00011140776699029125, "loss": 0.2727, "step": 279 }, { "epoch": 2.5701149425287357, "grad_norm": 0.32812729477882385, "learning_rate": 0.00011067961165048543, "loss": 0.2466, "step": 280 }, { "epoch": 2.5793103448275865, "grad_norm": 0.3916667401790619, "learning_rate": 0.00010995145631067961, "loss": 0.2962, "step": 281 }, { "epoch": 2.5885057471264368, "grad_norm": 0.41674014925956726, "learning_rate": 0.00010922330097087377, "loss": 0.3091, "step": 282 }, { "epoch": 2.5977011494252875, "grad_norm": 0.5579612851142883, "learning_rate": 0.00010849514563106795, "loss": 0.4515, "step": 283 }, { "epoch": 2.606896551724138, "grad_norm": 0.31362155079841614, "learning_rate": 0.00010776699029126213, "loss": 0.209, "step": 284 }, { "epoch": 2.6160919540229886, "grad_norm": 0.3783795237541199, "learning_rate": 0.00010703883495145631, "loss": 0.2683, "step": 285 }, { "epoch": 2.625287356321839, "grad_norm": 0.4127572178840637, "learning_rate": 0.00010631067961165047, "loss": 0.3164, "step": 286 }, { "epoch": 2.6344827586206896, "grad_norm": 0.42433518171310425, "learning_rate": 0.00010558252427184465, "loss": 0.2813, "step": 287 }, { "epoch": 2.6436781609195403, "grad_norm": 0.26535582542419434, "learning_rate": 0.00010485436893203883, "loss": 0.1638, "step": 288 }, { "epoch": 2.6528735632183906, "grad_norm": 0.47647133469581604, "learning_rate": 0.00010412621359223301, "loss": 0.3998, "step": 289 }, { "epoch": 2.6620689655172414, "grad_norm": 0.4837816059589386, "learning_rate": 0.00010339805825242718, "loss": 0.3634, "step": 290 }, { "epoch": 2.671264367816092, "grad_norm": 0.5139144062995911, "learning_rate": 0.00010266990291262135, "loss": 0.4553, "step": 291 }, { "epoch": 2.6804597701149424, "grad_norm": 0.42559128999710083, "learning_rate": 0.00010194174757281553, "loss": 0.2803, "step": 292 }, { "epoch": 2.689655172413793, "grad_norm": 0.5253325700759888, "learning_rate": 0.00010121359223300971, "loss": 0.4476, "step": 293 }, { "epoch": 2.698850574712644, "grad_norm": 0.42015260457992554, "learning_rate": 0.00010048543689320388, "loss": 0.3619, "step": 294 }, { "epoch": 2.708045977011494, "grad_norm": 0.7167945504188538, "learning_rate": 9.975728155339806e-05, "loss": 0.6277, "step": 295 }, { "epoch": 2.717241379310345, "grad_norm": 0.596862256526947, "learning_rate": 9.902912621359223e-05, "loss": 0.454, "step": 296 }, { "epoch": 2.7264367816091957, "grad_norm": 0.391558974981308, "learning_rate": 9.83009708737864e-05, "loss": 0.3283, "step": 297 }, { "epoch": 2.735632183908046, "grad_norm": 0.4755273759365082, "learning_rate": 9.757281553398058e-05, "loss": 0.4141, "step": 298 }, { "epoch": 2.7448275862068967, "grad_norm": 0.37474825978279114, "learning_rate": 9.684466019417476e-05, "loss": 0.285, "step": 299 }, { "epoch": 2.754022988505747, "grad_norm": 0.47797009348869324, "learning_rate": 9.611650485436893e-05, "loss": 0.3485, "step": 300 }, { "epoch": 2.7632183908045977, "grad_norm": 0.4562835395336151, "learning_rate": 9.53883495145631e-05, "loss": 0.3423, "step": 301 }, { "epoch": 2.772413793103448, "grad_norm": 0.6650474667549133, "learning_rate": 9.466019417475728e-05, "loss": 0.4271, "step": 302 }, { "epoch": 2.781609195402299, "grad_norm": 0.4036124050617218, "learning_rate": 9.393203883495146e-05, "loss": 0.2785, "step": 303 }, { "epoch": 2.7908045977011495, "grad_norm": 0.6422847509384155, "learning_rate": 9.320388349514561e-05, "loss": 0.4546, "step": 304 }, { "epoch": 2.8, "grad_norm": 0.4403219521045685, "learning_rate": 9.247572815533979e-05, "loss": 0.2975, "step": 305 }, { "epoch": 2.8091954022988506, "grad_norm": 0.34099337458610535, "learning_rate": 9.174757281553397e-05, "loss": 0.2092, "step": 306 }, { "epoch": 2.8183908045977013, "grad_norm": 0.6565693020820618, "learning_rate": 9.101941747572814e-05, "loss": 0.589, "step": 307 }, { "epoch": 2.8275862068965516, "grad_norm": 0.29657208919525146, "learning_rate": 9.029126213592231e-05, "loss": 0.2206, "step": 308 }, { "epoch": 2.8367816091954023, "grad_norm": 0.35843998193740845, "learning_rate": 8.956310679611649e-05, "loss": 0.2576, "step": 309 }, { "epoch": 2.845977011494253, "grad_norm": 0.39961719512939453, "learning_rate": 8.883495145631067e-05, "loss": 0.2589, "step": 310 }, { "epoch": 2.8551724137931034, "grad_norm": 0.2650497257709503, "learning_rate": 8.810679611650485e-05, "loss": 0.1499, "step": 311 }, { "epoch": 2.864367816091954, "grad_norm": 0.6040882468223572, "learning_rate": 8.737864077669901e-05, "loss": 0.5003, "step": 312 }, { "epoch": 2.873563218390805, "grad_norm": 0.36157143115997314, "learning_rate": 8.665048543689319e-05, "loss": 0.3019, "step": 313 }, { "epoch": 2.882758620689655, "grad_norm": 0.34061622619628906, "learning_rate": 8.592233009708737e-05, "loss": 0.26, "step": 314 }, { "epoch": 2.891954022988506, "grad_norm": 0.6060124635696411, "learning_rate": 8.519417475728155e-05, "loss": 0.5343, "step": 315 }, { "epoch": 2.901149425287356, "grad_norm": 0.536323070526123, "learning_rate": 8.446601941747571e-05, "loss": 0.3762, "step": 316 }, { "epoch": 2.910344827586207, "grad_norm": 0.6887696981430054, "learning_rate": 8.373786407766989e-05, "loss": 0.613, "step": 317 }, { "epoch": 2.9195402298850572, "grad_norm": 0.3895469009876251, "learning_rate": 8.300970873786407e-05, "loss": 0.2919, "step": 318 }, { "epoch": 2.928735632183908, "grad_norm": 0.43969079852104187, "learning_rate": 8.228155339805825e-05, "loss": 0.2747, "step": 319 }, { "epoch": 2.9379310344827587, "grad_norm": 0.6681087017059326, "learning_rate": 8.155339805825241e-05, "loss": 0.5428, "step": 320 }, { "epoch": 2.947126436781609, "grad_norm": 0.5340582132339478, "learning_rate": 8.082524271844659e-05, "loss": 0.3636, "step": 321 }, { "epoch": 2.9563218390804598, "grad_norm": 0.4719969630241394, "learning_rate": 8.009708737864077e-05, "loss": 0.3629, "step": 322 }, { "epoch": 2.9655172413793105, "grad_norm": 0.4452765882015228, "learning_rate": 7.936893203883495e-05, "loss": 0.3002, "step": 323 }, { "epoch": 2.974712643678161, "grad_norm": 0.5446481704711914, "learning_rate": 7.864077669902911e-05, "loss": 0.426, "step": 324 }, { "epoch": 2.9839080459770115, "grad_norm": 0.6985622048377991, "learning_rate": 7.791262135922329e-05, "loss": 0.467, "step": 325 }, { "epoch": 2.9931034482758623, "grad_norm": 0.6391457915306091, "learning_rate": 7.718446601941747e-05, "loss": 0.428, "step": 326 }, { "epoch": 3.0, "grad_norm": 0.7811146378517151, "learning_rate": 7.645631067961165e-05, "loss": 0.5441, "step": 327 }, { "epoch": 3.0, "eval_loss": 1.4894534349441528, "eval_runtime": 94.563, "eval_samples_per_second": 3.511, "eval_steps_per_second": 1.755, "step": 327 } ], "logging_steps": 1, "max_steps": 432, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.99509837713236e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }