{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0007067137809187, "eval_steps": 332, "global_step": 1327, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007538280329799764, "grad_norm": 16.65915870666504, "learning_rate": 2e-05, "loss": 0.9643, "step": 1 }, { "epoch": 0.0015076560659599528, "grad_norm": 18.07321548461914, "learning_rate": 4e-05, "loss": 0.99, "step": 2 }, { "epoch": 0.0022614840989399294, "grad_norm": 20.50757598876953, "learning_rate": 6e-05, "loss": 0.9625, "step": 3 }, { "epoch": 0.0030153121319199056, "grad_norm": 16.542922973632812, "learning_rate": 8e-05, "loss": 0.9352, "step": 4 }, { "epoch": 0.003769140164899882, "grad_norm": 16.435678482055664, "learning_rate": 0.0001, "loss": 0.9104, "step": 5 }, { "epoch": 0.004522968197879859, "grad_norm": 17.327836990356445, "learning_rate": 0.00012, "loss": 0.8723, "step": 6 }, { "epoch": 0.005276796230859835, "grad_norm": 15.497602462768555, "learning_rate": 0.00014, "loss": 0.7953, "step": 7 }, { "epoch": 0.006030624263839811, "grad_norm": 16.634872436523438, "learning_rate": 0.00016, "loss": 0.7444, "step": 8 }, { "epoch": 0.006784452296819788, "grad_norm": 18.223051071166992, "learning_rate": 0.00018, "loss": 0.7296, "step": 9 }, { "epoch": 0.007538280329799764, "grad_norm": 12.145986557006836, "learning_rate": 0.0002, "loss": 0.6559, "step": 10 }, { "epoch": 0.008292108362779741, "grad_norm": 12.613237380981445, "learning_rate": 0.00019999971548969982, "loss": 0.6663, "step": 11 }, { "epoch": 0.009045936395759718, "grad_norm": 12.572402954101562, "learning_rate": 0.0001999988619604182, "loss": 0.6575, "step": 12 }, { "epoch": 0.009799764428739694, "grad_norm": 10.410853385925293, "learning_rate": 0.00019999743941701188, "loss": 0.6172, "step": 13 }, { "epoch": 0.01055359246171967, "grad_norm": 9.993011474609375, "learning_rate": 0.00019999544786757545, "loss": 0.6205, "step": 14 }, { "epoch": 0.011307420494699646, "grad_norm": 10.803098678588867, "learning_rate": 0.00019999288732344122, "loss": 0.5932, "step": 15 }, { "epoch": 0.012061248527679622, "grad_norm": 10.612732887268066, "learning_rate": 0.0001999897577991792, "loss": 0.5989, "step": 16 }, { "epoch": 0.0128150765606596, "grad_norm": 10.924768447875977, "learning_rate": 0.0001999860593125971, "loss": 0.5946, "step": 17 }, { "epoch": 0.013568904593639576, "grad_norm": 9.804558753967285, "learning_rate": 0.00019998179188473997, "loss": 0.5321, "step": 18 }, { "epoch": 0.014322732626619553, "grad_norm": 10.648846626281738, "learning_rate": 0.00019997695553989042, "loss": 0.5584, "step": 19 }, { "epoch": 0.015076560659599529, "grad_norm": 10.692992210388184, "learning_rate": 0.00019997155030556822, "loss": 0.5603, "step": 20 }, { "epoch": 0.015830388692579505, "grad_norm": 10.715287208557129, "learning_rate": 0.00019996557621253027, "loss": 0.584, "step": 21 }, { "epoch": 0.016584216725559483, "grad_norm": 9.866957664489746, "learning_rate": 0.0001999590332947704, "loss": 0.5571, "step": 22 }, { "epoch": 0.017338044758539457, "grad_norm": 9.68693733215332, "learning_rate": 0.00019995192158951919, "loss": 0.5415, "step": 23 }, { "epoch": 0.018091872791519435, "grad_norm": 10.831818580627441, "learning_rate": 0.00019994424113724363, "loss": 0.5458, "step": 24 }, { "epoch": 0.01884570082449941, "grad_norm": 9.90411376953125, "learning_rate": 0.00019993599198164715, "loss": 0.5368, "step": 25 }, { "epoch": 0.019599528857479388, "grad_norm": 8.305344581604004, "learning_rate": 0.0001999271741696691, "loss": 0.5271, "step": 26 }, { "epoch": 0.020353356890459365, "grad_norm": 9.10693645477295, "learning_rate": 0.00019991778775148465, "loss": 0.5038, "step": 27 }, { "epoch": 0.02110718492343934, "grad_norm": 8.622981071472168, "learning_rate": 0.00019990783278050448, "loss": 0.5039, "step": 28 }, { "epoch": 0.021861012956419318, "grad_norm": 10.21834945678711, "learning_rate": 0.0001998973093133744, "loss": 0.5499, "step": 29 }, { "epoch": 0.022614840989399292, "grad_norm": 10.313283920288086, "learning_rate": 0.00019988621740997512, "loss": 0.5042, "step": 30 }, { "epoch": 0.02336866902237927, "grad_norm": 8.889609336853027, "learning_rate": 0.00019987455713342187, "loss": 0.467, "step": 31 }, { "epoch": 0.024122497055359245, "grad_norm": 8.749794006347656, "learning_rate": 0.000199862328550064, "loss": 0.5357, "step": 32 }, { "epoch": 0.024876325088339223, "grad_norm": 8.97386360168457, "learning_rate": 0.00019984953172948465, "loss": 0.495, "step": 33 }, { "epoch": 0.0256301531213192, "grad_norm": 9.057605743408203, "learning_rate": 0.0001998361667445004, "loss": 0.487, "step": 34 }, { "epoch": 0.026383981154299175, "grad_norm": 9.346535682678223, "learning_rate": 0.00019982223367116076, "loss": 0.5348, "step": 35 }, { "epoch": 0.027137809187279153, "grad_norm": 10.248679161071777, "learning_rate": 0.00019980773258874778, "loss": 0.5234, "step": 36 }, { "epoch": 0.027891637220259127, "grad_norm": 9.637868881225586, "learning_rate": 0.00019979266357977564, "loss": 0.5, "step": 37 }, { "epoch": 0.028645465253239105, "grad_norm": 10.087867736816406, "learning_rate": 0.00019977702672999007, "loss": 0.4772, "step": 38 }, { "epoch": 0.02939929328621908, "grad_norm": 11.312880516052246, "learning_rate": 0.00019976082212836793, "loss": 0.4689, "step": 39 }, { "epoch": 0.030153121319199058, "grad_norm": 10.71940803527832, "learning_rate": 0.0001997440498671168, "loss": 0.4961, "step": 40 }, { "epoch": 0.030906949352179035, "grad_norm": 10.881596565246582, "learning_rate": 0.00019972671004167433, "loss": 0.5628, "step": 41 }, { "epoch": 0.03166077738515901, "grad_norm": 11.740187644958496, "learning_rate": 0.00019970880275070762, "loss": 0.5096, "step": 42 }, { "epoch": 0.03241460541813899, "grad_norm": 12.402807235717773, "learning_rate": 0.00019969032809611287, "loss": 0.6114, "step": 43 }, { "epoch": 0.033168433451118966, "grad_norm": 13.945646286010742, "learning_rate": 0.0001996712861830147, "loss": 0.6009, "step": 44 }, { "epoch": 0.03392226148409894, "grad_norm": 14.720582962036133, "learning_rate": 0.00019965167711976552, "loss": 0.5767, "step": 45 }, { "epoch": 0.034676089517078915, "grad_norm": 15.46834945678711, "learning_rate": 0.0001996315010179449, "loss": 0.5566, "step": 46 }, { "epoch": 0.03542991755005889, "grad_norm": 16.198299407958984, "learning_rate": 0.00019961075799235903, "loss": 0.6248, "step": 47 }, { "epoch": 0.03618374558303887, "grad_norm": 36.07707214355469, "learning_rate": 0.00019958944816104, "loss": 0.5949, "step": 48 }, { "epoch": 0.03693757361601885, "grad_norm": 25.19093894958496, "learning_rate": 0.00019956757164524516, "loss": 0.5619, "step": 49 }, { "epoch": 0.03769140164899882, "grad_norm": 20.562816619873047, "learning_rate": 0.00019954512856945632, "loss": 0.662, "step": 50 }, { "epoch": 0.0384452296819788, "grad_norm": 40.9443359375, "learning_rate": 0.00019952211906137932, "loss": 0.8705, "step": 51 }, { "epoch": 0.039199057714958775, "grad_norm": 29.929576873779297, "learning_rate": 0.00019949854325194294, "loss": 0.7504, "step": 52 }, { "epoch": 0.03995288574793875, "grad_norm": 18.468313217163086, "learning_rate": 0.00019947440127529836, "loss": 0.7158, "step": 53 }, { "epoch": 0.04070671378091873, "grad_norm": 9.704629898071289, "learning_rate": 0.00019944969326881845, "loss": 0.5938, "step": 54 }, { "epoch": 0.0414605418138987, "grad_norm": 9.891565322875977, "learning_rate": 0.00019942441937309684, "loss": 0.5693, "step": 55 }, { "epoch": 0.04221436984687868, "grad_norm": 11.08341121673584, "learning_rate": 0.00019939857973194717, "loss": 0.5726, "step": 56 }, { "epoch": 0.04296819787985866, "grad_norm": 10.182625770568848, "learning_rate": 0.0001993721744924024, "loss": 0.5854, "step": 57 }, { "epoch": 0.043722025912838636, "grad_norm": 10.336113929748535, "learning_rate": 0.00019934520380471372, "loss": 0.5341, "step": 58 }, { "epoch": 0.04447585394581861, "grad_norm": 7.881448745727539, "learning_rate": 0.0001993176678223499, "loss": 0.5013, "step": 59 }, { "epoch": 0.045229681978798585, "grad_norm": 10.487141609191895, "learning_rate": 0.0001992895667019964, "loss": 0.5395, "step": 60 }, { "epoch": 0.04598351001177856, "grad_norm": 10.38466739654541, "learning_rate": 0.0001992609006035543, "loss": 0.5214, "step": 61 }, { "epoch": 0.04673733804475854, "grad_norm": 9.553030014038086, "learning_rate": 0.0001992316696901397, "loss": 0.4825, "step": 62 }, { "epoch": 0.04749116607773852, "grad_norm": 8.298136711120605, "learning_rate": 0.00019920187412808248, "loss": 0.4797, "step": 63 }, { "epoch": 0.04824499411071849, "grad_norm": 7.880730628967285, "learning_rate": 0.0001991715140869255, "loss": 0.4722, "step": 64 }, { "epoch": 0.04899882214369847, "grad_norm": 8.825959205627441, "learning_rate": 0.00019914058973942368, "loss": 0.5022, "step": 65 }, { "epoch": 0.049752650176678445, "grad_norm": 8.143362998962402, "learning_rate": 0.00019910910126154293, "loss": 0.4975, "step": 66 }, { "epoch": 0.05050647820965842, "grad_norm": 7.704590797424316, "learning_rate": 0.00019907704883245916, "loss": 0.4973, "step": 67 }, { "epoch": 0.0512603062426384, "grad_norm": 7.914122104644775, "learning_rate": 0.00019904443263455728, "loss": 0.5046, "step": 68 }, { "epoch": 0.05201413427561837, "grad_norm": 8.946449279785156, "learning_rate": 0.00019901125285343022, "loss": 0.5124, "step": 69 }, { "epoch": 0.05276796230859835, "grad_norm": 7.793578147888184, "learning_rate": 0.0001989775096778777, "loss": 0.5141, "step": 70 }, { "epoch": 0.05352179034157833, "grad_norm": 7.8742756843566895, "learning_rate": 0.0001989432032999054, "loss": 0.5071, "step": 71 }, { "epoch": 0.054275618374558306, "grad_norm": 8.052972793579102, "learning_rate": 0.0001989083339147237, "loss": 0.4938, "step": 72 }, { "epoch": 0.055029446407538284, "grad_norm": 7.386316776275635, "learning_rate": 0.0001988729017207465, "loss": 0.4767, "step": 73 }, { "epoch": 0.055783274440518255, "grad_norm": 7.526272773742676, "learning_rate": 0.00019883690691959035, "loss": 0.4642, "step": 74 }, { "epoch": 0.05653710247349823, "grad_norm": 8.339061737060547, "learning_rate": 0.00019880034971607308, "loss": 0.4888, "step": 75 }, { "epoch": 0.05729093050647821, "grad_norm": 8.045515060424805, "learning_rate": 0.00019876323031821266, "loss": 0.4478, "step": 76 }, { "epoch": 0.05804475853945819, "grad_norm": 8.333029747009277, "learning_rate": 0.00019872554893722618, "loss": 0.4695, "step": 77 }, { "epoch": 0.05879858657243816, "grad_norm": 8.050617218017578, "learning_rate": 0.0001986873057875284, "loss": 0.4532, "step": 78 }, { "epoch": 0.05955241460541814, "grad_norm": 8.27062702178955, "learning_rate": 0.00019864850108673073, "loss": 0.4654, "step": 79 }, { "epoch": 0.060306242638398115, "grad_norm": 8.429513931274414, "learning_rate": 0.0001986091350556399, "loss": 0.4829, "step": 80 }, { "epoch": 0.06106007067137809, "grad_norm": 8.401616096496582, "learning_rate": 0.00019856920791825683, "loss": 0.5086, "step": 81 }, { "epoch": 0.06181389870435807, "grad_norm": 8.308648109436035, "learning_rate": 0.00019852871990177503, "loss": 0.4758, "step": 82 }, { "epoch": 0.06256772673733804, "grad_norm": 8.516093254089355, "learning_rate": 0.00019848767123657976, "loss": 0.4423, "step": 83 }, { "epoch": 0.06332155477031802, "grad_norm": 8.437211990356445, "learning_rate": 0.0001984460621562463, "loss": 0.4429, "step": 84 }, { "epoch": 0.064075382803298, "grad_norm": 8.637296676635742, "learning_rate": 0.00019840389289753896, "loss": 0.4956, "step": 85 }, { "epoch": 0.06482921083627798, "grad_norm": 8.39278507232666, "learning_rate": 0.00019836116370040944, "loss": 0.4483, "step": 86 }, { "epoch": 0.06558303886925795, "grad_norm": 9.617965698242188, "learning_rate": 0.00019831787480799568, "loss": 0.4714, "step": 87 }, { "epoch": 0.06633686690223793, "grad_norm": 8.52342700958252, "learning_rate": 0.00019827402646662047, "loss": 0.4375, "step": 88 }, { "epoch": 0.06709069493521791, "grad_norm": 9.882357597351074, "learning_rate": 0.0001982296189257898, "loss": 0.4796, "step": 89 }, { "epoch": 0.06784452296819787, "grad_norm": 9.361654281616211, "learning_rate": 0.00019818465243819184, "loss": 0.4871, "step": 90 }, { "epoch": 0.06859835100117785, "grad_norm": 9.959556579589844, "learning_rate": 0.00019813912725969509, "loss": 0.472, "step": 91 }, { "epoch": 0.06935217903415783, "grad_norm": 9.579131126403809, "learning_rate": 0.0001980930436493472, "loss": 0.4906, "step": 92 }, { "epoch": 0.07010600706713781, "grad_norm": 10.082910537719727, "learning_rate": 0.00019804640186937343, "loss": 0.537, "step": 93 }, { "epoch": 0.07085983510011779, "grad_norm": 10.720930099487305, "learning_rate": 0.0001979992021851751, "loss": 0.5277, "step": 94 }, { "epoch": 0.07161366313309776, "grad_norm": 10.86539363861084, "learning_rate": 0.00019795144486532814, "loss": 0.5511, "step": 95 }, { "epoch": 0.07236749116607774, "grad_norm": 13.410208702087402, "learning_rate": 0.00019790313018158156, "loss": 0.5658, "step": 96 }, { "epoch": 0.07312131919905772, "grad_norm": 14.898797988891602, "learning_rate": 0.0001978542584088558, "loss": 0.5529, "step": 97 }, { "epoch": 0.0738751472320377, "grad_norm": 14.036768913269043, "learning_rate": 0.00019780482982524142, "loss": 0.5396, "step": 98 }, { "epoch": 0.07462897526501767, "grad_norm": 15.9882173538208, "learning_rate": 0.00019775484471199715, "loss": 0.514, "step": 99 }, { "epoch": 0.07538280329799764, "grad_norm": 17.01093864440918, "learning_rate": 0.0001977043033535486, "loss": 0.5262, "step": 100 }, { "epoch": 0.07613663133097762, "grad_norm": 41.135196685791016, "learning_rate": 0.00019765320603748655, "loss": 0.7909, "step": 101 }, { "epoch": 0.0768904593639576, "grad_norm": 25.291397094726562, "learning_rate": 0.0001976015530545652, "loss": 0.714, "step": 102 }, { "epoch": 0.07764428739693757, "grad_norm": 12.169105529785156, "learning_rate": 0.0001975493446987007, "loss": 0.5999, "step": 103 }, { "epoch": 0.07839811542991755, "grad_norm": 8.400662422180176, "learning_rate": 0.00019749658126696934, "loss": 0.5707, "step": 104 }, { "epoch": 0.07915194346289753, "grad_norm": 10.622336387634277, "learning_rate": 0.00019744326305960595, "loss": 0.5798, "step": 105 }, { "epoch": 0.0799057714958775, "grad_norm": 10.29685115814209, "learning_rate": 0.00019738939038000205, "loss": 0.5752, "step": 106 }, { "epoch": 0.08065959952885748, "grad_norm": 7.853797435760498, "learning_rate": 0.00019733496353470433, "loss": 0.543, "step": 107 }, { "epoch": 0.08141342756183746, "grad_norm": 7.910231113433838, "learning_rate": 0.00019727998283341274, "loss": 0.5155, "step": 108 }, { "epoch": 0.08216725559481743, "grad_norm": 8.53306770324707, "learning_rate": 0.00019722444858897878, "loss": 0.5029, "step": 109 }, { "epoch": 0.0829210836277974, "grad_norm": 8.579912185668945, "learning_rate": 0.00019716836111740378, "loss": 0.487, "step": 110 }, { "epoch": 0.08367491166077738, "grad_norm": 8.553475379943848, "learning_rate": 0.00019711172073783696, "loss": 0.4853, "step": 111 }, { "epoch": 0.08442873969375736, "grad_norm": 9.386043548583984, "learning_rate": 0.00019705452777257377, "loss": 0.4941, "step": 112 }, { "epoch": 0.08518256772673734, "grad_norm": 8.391158103942871, "learning_rate": 0.000196996782547054, "loss": 0.4657, "step": 113 }, { "epoch": 0.08593639575971732, "grad_norm": 8.52602481842041, "learning_rate": 0.00019693848538985983, "loss": 0.4744, "step": 114 }, { "epoch": 0.0866902237926973, "grad_norm": 7.8026885986328125, "learning_rate": 0.00019687963663271409, "loss": 0.4742, "step": 115 }, { "epoch": 0.08744405182567727, "grad_norm": 8.957297325134277, "learning_rate": 0.00019682023661047836, "loss": 0.4846, "step": 116 }, { "epoch": 0.08819787985865725, "grad_norm": 8.33506965637207, "learning_rate": 0.00019676028566115102, "loss": 0.47, "step": 117 }, { "epoch": 0.08895170789163721, "grad_norm": 7.720737934112549, "learning_rate": 0.00019669978412586528, "loss": 0.4512, "step": 118 }, { "epoch": 0.08970553592461719, "grad_norm": 7.069596290588379, "learning_rate": 0.00019663873234888733, "loss": 0.4685, "step": 119 }, { "epoch": 0.09045936395759717, "grad_norm": 7.589311599731445, "learning_rate": 0.0001965771306776144, "loss": 0.4702, "step": 120 }, { "epoch": 0.09121319199057715, "grad_norm": 7.950814723968506, "learning_rate": 0.00019651497946257266, "loss": 0.4797, "step": 121 }, { "epoch": 0.09196702002355713, "grad_norm": 7.834803581237793, "learning_rate": 0.00019645227905741534, "loss": 0.4512, "step": 122 }, { "epoch": 0.0927208480565371, "grad_norm": 7.925727844238281, "learning_rate": 0.00019638902981892068, "loss": 0.4702, "step": 123 }, { "epoch": 0.09347467608951708, "grad_norm": 7.2047038078308105, "learning_rate": 0.00019632523210698987, "loss": 0.4586, "step": 124 }, { "epoch": 0.09422850412249706, "grad_norm": 8.701865196228027, "learning_rate": 0.00019626088628464498, "loss": 0.4629, "step": 125 }, { "epoch": 0.09498233215547704, "grad_norm": 7.792990684509277, "learning_rate": 0.00019619599271802706, "loss": 0.4578, "step": 126 }, { "epoch": 0.09573616018845701, "grad_norm": 7.0652642250061035, "learning_rate": 0.00019613055177639384, "loss": 0.4439, "step": 127 }, { "epoch": 0.09648998822143698, "grad_norm": 7.519805431365967, "learning_rate": 0.00019606456383211777, "loss": 0.4371, "step": 128 }, { "epoch": 0.09724381625441696, "grad_norm": 7.7905659675598145, "learning_rate": 0.00019599802926068384, "loss": 0.4631, "step": 129 }, { "epoch": 0.09799764428739693, "grad_norm": 7.713627338409424, "learning_rate": 0.00019593094844068748, "loss": 0.4415, "step": 130 }, { "epoch": 0.09875147232037691, "grad_norm": 7.864312171936035, "learning_rate": 0.00019586332175383238, "loss": 0.493, "step": 131 }, { "epoch": 0.09950530035335689, "grad_norm": 7.424186706542969, "learning_rate": 0.00019579514958492826, "loss": 0.4105, "step": 132 }, { "epoch": 0.10025912838633687, "grad_norm": 7.774516582489014, "learning_rate": 0.0001957264323218889, "loss": 0.4406, "step": 133 }, { "epoch": 0.10101295641931685, "grad_norm": 8.56273365020752, "learning_rate": 0.0001956571703557296, "loss": 0.4743, "step": 134 }, { "epoch": 0.10176678445229682, "grad_norm": 7.981069087982178, "learning_rate": 0.00019558736408056525, "loss": 0.4167, "step": 135 }, { "epoch": 0.1025206124852768, "grad_norm": 7.851569175720215, "learning_rate": 0.00019551701389360795, "loss": 0.4582, "step": 136 }, { "epoch": 0.10327444051825677, "grad_norm": 7.7381510734558105, "learning_rate": 0.00019544612019516472, "loss": 0.4336, "step": 137 }, { "epoch": 0.10402826855123674, "grad_norm": 8.127756118774414, "learning_rate": 0.00019537468338863537, "loss": 0.4588, "step": 138 }, { "epoch": 0.10478209658421672, "grad_norm": 7.989606857299805, "learning_rate": 0.00019530270388050998, "loss": 0.4269, "step": 139 }, { "epoch": 0.1055359246171967, "grad_norm": 8.431105613708496, "learning_rate": 0.00019523018208036677, "loss": 0.4645, "step": 140 }, { "epoch": 0.10628975265017668, "grad_norm": 8.575553894042969, "learning_rate": 0.0001951571184008698, "loss": 0.4587, "step": 141 }, { "epoch": 0.10704358068315666, "grad_norm": 9.703766822814941, "learning_rate": 0.00019508351325776642, "loss": 0.4826, "step": 142 }, { "epoch": 0.10779740871613663, "grad_norm": 10.319994926452637, "learning_rate": 0.00019500936706988502, "loss": 0.5255, "step": 143 }, { "epoch": 0.10855123674911661, "grad_norm": 11.801458358764648, "learning_rate": 0.00019493468025913276, "loss": 0.5143, "step": 144 }, { "epoch": 0.10930506478209659, "grad_norm": 11.02754020690918, "learning_rate": 0.00019485945325049288, "loss": 0.4947, "step": 145 }, { "epoch": 0.11005889281507657, "grad_norm": 11.526784896850586, "learning_rate": 0.00019478368647202264, "loss": 0.5627, "step": 146 }, { "epoch": 0.11081272084805653, "grad_norm": 11.704715728759766, "learning_rate": 0.00019470738035485058, "loss": 0.5015, "step": 147 }, { "epoch": 0.11156654888103651, "grad_norm": 14.198360443115234, "learning_rate": 0.00019463053533317425, "loss": 0.5488, "step": 148 }, { "epoch": 0.11232037691401649, "grad_norm": 14.75071907043457, "learning_rate": 0.0001945531518442576, "loss": 0.5327, "step": 149 }, { "epoch": 0.11307420494699646, "grad_norm": 17.345752716064453, "learning_rate": 0.0001944752303284287, "loss": 0.4909, "step": 150 }, { "epoch": 0.11382803297997644, "grad_norm": 25.253982543945312, "learning_rate": 0.00019439677122907697, "loss": 0.7106, "step": 151 }, { "epoch": 0.11458186101295642, "grad_norm": 22.05714988708496, "learning_rate": 0.00019431777499265087, "loss": 0.6719, "step": 152 }, { "epoch": 0.1153356890459364, "grad_norm": 14.386154174804688, "learning_rate": 0.00019423824206865527, "loss": 0.663, "step": 153 }, { "epoch": 0.11608951707891638, "grad_norm": 8.701356887817383, "learning_rate": 0.00019415817290964883, "loss": 0.5581, "step": 154 }, { "epoch": 0.11684334511189635, "grad_norm": 8.447550773620605, "learning_rate": 0.00019407756797124164, "loss": 0.5545, "step": 155 }, { "epoch": 0.11759717314487632, "grad_norm": 9.116722106933594, "learning_rate": 0.00019399642771209238, "loss": 0.5284, "step": 156 }, { "epoch": 0.1183510011778563, "grad_norm": 9.142845153808594, "learning_rate": 0.00019391475259390584, "loss": 0.5052, "step": 157 }, { "epoch": 0.11910482921083627, "grad_norm": 9.175527572631836, "learning_rate": 0.0001938325430814302, "loss": 0.524, "step": 158 }, { "epoch": 0.11985865724381625, "grad_norm": 8.684857368469238, "learning_rate": 0.00019374979964245463, "loss": 0.5387, "step": 159 }, { "epoch": 0.12061248527679623, "grad_norm": 9.40937328338623, "learning_rate": 0.00019366652274780628, "loss": 0.5081, "step": 160 }, { "epoch": 0.12136631330977621, "grad_norm": 9.983878135681152, "learning_rate": 0.00019358271287134784, "loss": 0.5234, "step": 161 }, { "epoch": 0.12212014134275619, "grad_norm": 8.468266487121582, "learning_rate": 0.00019349837048997478, "loss": 0.5008, "step": 162 }, { "epoch": 0.12287396937573616, "grad_norm": 7.315543174743652, "learning_rate": 0.00019341349608361267, "loss": 0.4778, "step": 163 }, { "epoch": 0.12362779740871614, "grad_norm": 8.254434585571289, "learning_rate": 0.00019332809013521428, "loss": 0.4949, "step": 164 }, { "epoch": 0.12438162544169612, "grad_norm": 9.409392356872559, "learning_rate": 0.00019324215313075706, "loss": 0.4842, "step": 165 }, { "epoch": 0.12513545347467608, "grad_norm": 7.584166526794434, "learning_rate": 0.00019315568555924035, "loss": 0.4859, "step": 166 }, { "epoch": 0.12588928150765608, "grad_norm": 7.280964374542236, "learning_rate": 0.0001930686879126824, "loss": 0.4436, "step": 167 }, { "epoch": 0.12664310954063604, "grad_norm": 7.54876708984375, "learning_rate": 0.0001929811606861177, "loss": 0.4636, "step": 168 }, { "epoch": 0.12739693757361603, "grad_norm": 8.36787223815918, "learning_rate": 0.00019289310437759427, "loss": 0.4862, "step": 169 }, { "epoch": 0.128150765606596, "grad_norm": 8.098321914672852, "learning_rate": 0.00019280451948817059, "loss": 0.4558, "step": 170 }, { "epoch": 0.12890459363957596, "grad_norm": 8.111252784729004, "learning_rate": 0.00019271540652191296, "loss": 0.461, "step": 171 }, { "epoch": 0.12965842167255595, "grad_norm": 7.394045829772949, "learning_rate": 0.0001926257659858925, "loss": 0.4397, "step": 172 }, { "epoch": 0.13041224970553592, "grad_norm": 7.361767768859863, "learning_rate": 0.00019253559839018235, "loss": 0.4811, "step": 173 }, { "epoch": 0.1311660777385159, "grad_norm": 7.598999500274658, "learning_rate": 0.00019244490424785468, "loss": 0.4353, "step": 174 }, { "epoch": 0.13191990577149587, "grad_norm": 7.871952056884766, "learning_rate": 0.00019235368407497788, "loss": 0.4847, "step": 175 }, { "epoch": 0.13267373380447586, "grad_norm": 7.250602722167969, "learning_rate": 0.00019226193839061347, "loss": 0.4482, "step": 176 }, { "epoch": 0.13342756183745583, "grad_norm": 7.890292644500732, "learning_rate": 0.0001921696677168133, "loss": 0.4475, "step": 177 }, { "epoch": 0.13418138987043582, "grad_norm": 7.192571640014648, "learning_rate": 0.00019207687257861655, "loss": 0.4093, "step": 178 }, { "epoch": 0.13493521790341578, "grad_norm": 8.001566886901855, "learning_rate": 0.00019198355350404667, "loss": 0.4729, "step": 179 }, { "epoch": 0.13568904593639575, "grad_norm": 7.559464454650879, "learning_rate": 0.00019188971102410837, "loss": 0.4455, "step": 180 }, { "epoch": 0.13644287396937574, "grad_norm": 7.921515941619873, "learning_rate": 0.00019179534567278475, "loss": 0.4421, "step": 181 }, { "epoch": 0.1371967020023557, "grad_norm": 7.778410911560059, "learning_rate": 0.00019170045798703406, "loss": 0.4485, "step": 182 }, { "epoch": 0.1379505300353357, "grad_norm": 7.606152534484863, "learning_rate": 0.0001916050485067868, "loss": 0.4235, "step": 183 }, { "epoch": 0.13870435806831566, "grad_norm": 7.29620361328125, "learning_rate": 0.00019150911777494258, "loss": 0.4433, "step": 184 }, { "epoch": 0.13945818610129565, "grad_norm": 7.7016072273254395, "learning_rate": 0.00019141266633736697, "loss": 0.444, "step": 185 }, { "epoch": 0.14021201413427561, "grad_norm": 7.524323463439941, "learning_rate": 0.0001913156947428886, "loss": 0.4481, "step": 186 }, { "epoch": 0.1409658421672556, "grad_norm": 7.7455525398254395, "learning_rate": 0.00019121820354329577, "loss": 0.4152, "step": 187 }, { "epoch": 0.14171967020023557, "grad_norm": 8.12897777557373, "learning_rate": 0.00019112019329333346, "loss": 0.4443, "step": 188 }, { "epoch": 0.14247349823321553, "grad_norm": 7.774250507354736, "learning_rate": 0.00019102166455070024, "loss": 0.4442, "step": 189 }, { "epoch": 0.14322732626619553, "grad_norm": 8.02647876739502, "learning_rate": 0.00019092261787604492, "loss": 0.4489, "step": 190 }, { "epoch": 0.1439811542991755, "grad_norm": 7.7611799240112305, "learning_rate": 0.00019082305383296352, "loss": 0.4122, "step": 191 }, { "epoch": 0.14473498233215548, "grad_norm": 9.484501838684082, "learning_rate": 0.00019072297298799589, "loss": 0.4725, "step": 192 }, { "epoch": 0.14548881036513545, "grad_norm": 9.696186065673828, "learning_rate": 0.00019062237591062272, "loss": 0.4913, "step": 193 }, { "epoch": 0.14624263839811544, "grad_norm": 11.048422813415527, "learning_rate": 0.00019052126317326207, "loss": 0.5425, "step": 194 }, { "epoch": 0.1469964664310954, "grad_norm": 10.327349662780762, "learning_rate": 0.00019041963535126625, "loss": 0.5171, "step": 195 }, { "epoch": 0.1477502944640754, "grad_norm": 11.808932304382324, "learning_rate": 0.0001903174930229185, "loss": 0.504, "step": 196 }, { "epoch": 0.14850412249705536, "grad_norm": 11.13871955871582, "learning_rate": 0.00019021483676942973, "loss": 0.5261, "step": 197 }, { "epoch": 0.14925795053003535, "grad_norm": 11.771498680114746, "learning_rate": 0.00019011166717493517, "loss": 0.5062, "step": 198 }, { "epoch": 0.1500117785630153, "grad_norm": 13.0664644241333, "learning_rate": 0.000190007984826491, "loss": 0.5488, "step": 199 }, { "epoch": 0.15076560659599528, "grad_norm": 15.87386417388916, "learning_rate": 0.00018990379031407124, "loss": 0.547, "step": 200 }, { "epoch": 0.15151943462897527, "grad_norm": 20.688980102539062, "learning_rate": 0.00018979908423056408, "loss": 0.7222, "step": 201 }, { "epoch": 0.15227326266195523, "grad_norm": 16.90519905090332, "learning_rate": 0.0001896938671717687, "loss": 0.6582, "step": 202 }, { "epoch": 0.15302709069493522, "grad_norm": 11.236451148986816, "learning_rate": 0.00018958813973639184, "loss": 0.6151, "step": 203 }, { "epoch": 0.1537809187279152, "grad_norm": 8.368070602416992, "learning_rate": 0.0001894819025260444, "loss": 0.5729, "step": 204 }, { "epoch": 0.15453474676089518, "grad_norm": 7.891096115112305, "learning_rate": 0.00018937515614523797, "loss": 0.5132, "step": 205 }, { "epoch": 0.15528857479387514, "grad_norm": 8.290247917175293, "learning_rate": 0.0001892679012013815, "loss": 0.5311, "step": 206 }, { "epoch": 0.15604240282685514, "grad_norm": 8.068524360656738, "learning_rate": 0.00018916013830477766, "loss": 0.5038, "step": 207 }, { "epoch": 0.1567962308598351, "grad_norm": 7.199114799499512, "learning_rate": 0.00018905186806861957, "loss": 0.4933, "step": 208 }, { "epoch": 0.15755005889281506, "grad_norm": 6.769901275634766, "learning_rate": 0.00018894309110898712, "loss": 0.4743, "step": 209 }, { "epoch": 0.15830388692579506, "grad_norm": 7.485007286071777, "learning_rate": 0.00018883380804484367, "loss": 0.4832, "step": 210 }, { "epoch": 0.15905771495877502, "grad_norm": 7.059638500213623, "learning_rate": 0.00018872401949803237, "loss": 0.4544, "step": 211 }, { "epoch": 0.159811542991755, "grad_norm": 7.6718549728393555, "learning_rate": 0.00018861372609327263, "loss": 0.4727, "step": 212 }, { "epoch": 0.16056537102473498, "grad_norm": 7.764082431793213, "learning_rate": 0.00018850292845815672, "loss": 0.4645, "step": 213 }, { "epoch": 0.16131919905771497, "grad_norm": 8.037138938903809, "learning_rate": 0.0001883916272231459, "loss": 0.4712, "step": 214 }, { "epoch": 0.16207302709069493, "grad_norm": 7.26751184463501, "learning_rate": 0.0001882798230215672, "loss": 0.4477, "step": 215 }, { "epoch": 0.16282685512367492, "grad_norm": 7.747137069702148, "learning_rate": 0.00018816751648960956, "loss": 0.4544, "step": 216 }, { "epoch": 0.1635806831566549, "grad_norm": 7.478286266326904, "learning_rate": 0.00018805470826632024, "loss": 0.4539, "step": 217 }, { "epoch": 0.16433451118963485, "grad_norm": 7.051617622375488, "learning_rate": 0.0001879413989936013, "loss": 0.4688, "step": 218 }, { "epoch": 0.16508833922261484, "grad_norm": 7.303111553192139, "learning_rate": 0.00018782758931620584, "loss": 0.4551, "step": 219 }, { "epoch": 0.1658421672555948, "grad_norm": 7.094053745269775, "learning_rate": 0.00018771327988173435, "loss": 0.4398, "step": 220 }, { "epoch": 0.1665959952885748, "grad_norm": 7.781626224517822, "learning_rate": 0.00018759847134063108, "loss": 0.4719, "step": 221 }, { "epoch": 0.16734982332155476, "grad_norm": 7.860665321350098, "learning_rate": 0.0001874831643461803, "loss": 0.4573, "step": 222 }, { "epoch": 0.16810365135453476, "grad_norm": 7.380893707275391, "learning_rate": 0.00018736735955450251, "loss": 0.4341, "step": 223 }, { "epoch": 0.16885747938751472, "grad_norm": 7.672417163848877, "learning_rate": 0.0001872510576245509, "loss": 0.4511, "step": 224 }, { "epoch": 0.1696113074204947, "grad_norm": 7.173273086547852, "learning_rate": 0.00018713425921810733, "loss": 0.4374, "step": 225 }, { "epoch": 0.17036513545347468, "grad_norm": 7.41825532913208, "learning_rate": 0.00018701696499977884, "loss": 0.4464, "step": 226 }, { "epoch": 0.17111896348645464, "grad_norm": 8.151430130004883, "learning_rate": 0.0001868991756369937, "loss": 0.4535, "step": 227 }, { "epoch": 0.17187279151943463, "grad_norm": 7.760961532592773, "learning_rate": 0.00018678089179999762, "loss": 0.4731, "step": 228 }, { "epoch": 0.1726266195524146, "grad_norm": 8.02840518951416, "learning_rate": 0.00018666211416184999, "loss": 0.4745, "step": 229 }, { "epoch": 0.1733804475853946, "grad_norm": 7.38688850402832, "learning_rate": 0.00018654284339842013, "loss": 0.4341, "step": 230 }, { "epoch": 0.17413427561837455, "grad_norm": 7.492348670959473, "learning_rate": 0.00018642308018838316, "loss": 0.4147, "step": 231 }, { "epoch": 0.17488810365135454, "grad_norm": 7.687479019165039, "learning_rate": 0.00018630282521321645, "loss": 0.4404, "step": 232 }, { "epoch": 0.1756419316843345, "grad_norm": 7.790548324584961, "learning_rate": 0.0001861820791571956, "loss": 0.4389, "step": 233 }, { "epoch": 0.1763957597173145, "grad_norm": 7.557417392730713, "learning_rate": 0.00018606084270739049, "loss": 0.4467, "step": 234 }, { "epoch": 0.17714958775029446, "grad_norm": 7.971850872039795, "learning_rate": 0.0001859391165536615, "loss": 0.415, "step": 235 }, { "epoch": 0.17790341578327443, "grad_norm": 8.08571720123291, "learning_rate": 0.0001858169013886556, "loss": 0.4488, "step": 236 }, { "epoch": 0.17865724381625442, "grad_norm": 7.706898212432861, "learning_rate": 0.00018569419790780218, "loss": 0.4296, "step": 237 }, { "epoch": 0.17941107184923438, "grad_norm": 7.6118245124816895, "learning_rate": 0.00018557100680930937, "loss": 0.4223, "step": 238 }, { "epoch": 0.18016489988221437, "grad_norm": 8.255146980285645, "learning_rate": 0.00018544732879415986, "loss": 0.4802, "step": 239 }, { "epoch": 0.18091872791519434, "grad_norm": 9.077119827270508, "learning_rate": 0.00018532316456610704, "loss": 0.4376, "step": 240 }, { "epoch": 0.18167255594817433, "grad_norm": 8.465483665466309, "learning_rate": 0.00018519851483167097, "loss": 0.4339, "step": 241 }, { "epoch": 0.1824263839811543, "grad_norm": 9.302364349365234, "learning_rate": 0.00018507338030013427, "loss": 0.4429, "step": 242 }, { "epoch": 0.18318021201413429, "grad_norm": 10.150344848632812, "learning_rate": 0.00018494776168353827, "loss": 0.4768, "step": 243 }, { "epoch": 0.18393404004711425, "grad_norm": 10.960404396057129, "learning_rate": 0.00018482165969667874, "loss": 0.5072, "step": 244 }, { "epoch": 0.18468786808009424, "grad_norm": 10.028700828552246, "learning_rate": 0.00018469507505710194, "loss": 0.5194, "step": 245 }, { "epoch": 0.1854416961130742, "grad_norm": 10.371344566345215, "learning_rate": 0.00018456800848510056, "loss": 0.4974, "step": 246 }, { "epoch": 0.18619552414605417, "grad_norm": 11.256722450256348, "learning_rate": 0.00018444046070370963, "loss": 0.4655, "step": 247 }, { "epoch": 0.18694935217903416, "grad_norm": 11.339438438415527, "learning_rate": 0.00018431243243870223, "loss": 0.5004, "step": 248 }, { "epoch": 0.18770318021201413, "grad_norm": 12.51115894317627, "learning_rate": 0.00018418392441858555, "loss": 0.5498, "step": 249 }, { "epoch": 0.18845700824499412, "grad_norm": 12.920282363891602, "learning_rate": 0.0001840549373745968, "loss": 0.4545, "step": 250 }, { "epoch": 0.18921083627797408, "grad_norm": 17.809480667114258, "learning_rate": 0.0001839254720406987, "loss": 0.6779, "step": 251 }, { "epoch": 0.18996466431095407, "grad_norm": 14.654753684997559, "learning_rate": 0.00018379552915357575, "loss": 0.639, "step": 252 }, { "epoch": 0.19071849234393404, "grad_norm": 10.703547477722168, "learning_rate": 0.00018366510945262972, "loss": 0.6024, "step": 253 }, { "epoch": 0.19147232037691403, "grad_norm": 9.329981803894043, "learning_rate": 0.00018353421367997563, "loss": 0.5221, "step": 254 }, { "epoch": 0.192226148409894, "grad_norm": 7.0998663902282715, "learning_rate": 0.00018340284258043732, "loss": 0.5203, "step": 255 }, { "epoch": 0.19297997644287396, "grad_norm": 8.919529914855957, "learning_rate": 0.00018327099690154344, "loss": 0.5286, "step": 256 }, { "epoch": 0.19373380447585395, "grad_norm": 8.378999710083008, "learning_rate": 0.00018313867739352304, "loss": 0.4929, "step": 257 }, { "epoch": 0.1944876325088339, "grad_norm": 7.437035083770752, "learning_rate": 0.00018300588480930143, "loss": 0.4622, "step": 258 }, { "epoch": 0.1952414605418139, "grad_norm": 7.368019104003906, "learning_rate": 0.0001828726199044957, "loss": 0.4824, "step": 259 }, { "epoch": 0.19599528857479387, "grad_norm": 7.174773693084717, "learning_rate": 0.0001827388834374107, "loss": 0.4601, "step": 260 }, { "epoch": 0.19674911660777386, "grad_norm": 7.612614154815674, "learning_rate": 0.0001826046761690344, "loss": 0.474, "step": 261 }, { "epoch": 0.19750294464075382, "grad_norm": 8.047442436218262, "learning_rate": 0.00018246999886303383, "loss": 0.4594, "step": 262 }, { "epoch": 0.19825677267373382, "grad_norm": 7.06972599029541, "learning_rate": 0.00018233485228575063, "loss": 0.4944, "step": 263 }, { "epoch": 0.19901060070671378, "grad_norm": 7.2451324462890625, "learning_rate": 0.00018219923720619663, "loss": 0.4748, "step": 264 }, { "epoch": 0.19976442873969374, "grad_norm": 8.119038581848145, "learning_rate": 0.0001820631543960496, "loss": 0.4286, "step": 265 }, { "epoch": 0.20051825677267374, "grad_norm": 8.046279907226562, "learning_rate": 0.0001819266046296487, "loss": 0.4566, "step": 266 }, { "epoch": 0.2012720848056537, "grad_norm": 6.79647970199585, "learning_rate": 0.00018178958868399033, "loss": 0.4214, "step": 267 }, { "epoch": 0.2020259128386337, "grad_norm": 6.761276721954346, "learning_rate": 0.00018165210733872336, "loss": 0.4272, "step": 268 }, { "epoch": 0.20277974087161366, "grad_norm": 7.771080493927002, "learning_rate": 0.000181514161376145, "loss": 0.4602, "step": 269 }, { "epoch": 0.20353356890459365, "grad_norm": 7.610669136047363, "learning_rate": 0.0001813757515811962, "loss": 0.4413, "step": 270 }, { "epoch": 0.2042873969375736, "grad_norm": 7.277632236480713, "learning_rate": 0.00018123687874145721, "loss": 0.417, "step": 271 }, { "epoch": 0.2050412249705536, "grad_norm": 7.344987869262695, "learning_rate": 0.00018109754364714305, "loss": 0.4326, "step": 272 }, { "epoch": 0.20579505300353357, "grad_norm": 7.373658180236816, "learning_rate": 0.0001809577470910992, "loss": 0.4107, "step": 273 }, { "epoch": 0.20654888103651353, "grad_norm": 8.498446464538574, "learning_rate": 0.00018081748986879679, "loss": 0.4463, "step": 274 }, { "epoch": 0.20730270906949352, "grad_norm": 7.138429164886475, "learning_rate": 0.00018067677277832834, "loss": 0.4354, "step": 275 }, { "epoch": 0.2080565371024735, "grad_norm": 7.916346073150635, "learning_rate": 0.00018053559662040302, "loss": 0.448, "step": 276 }, { "epoch": 0.20881036513545348, "grad_norm": 6.8389201164245605, "learning_rate": 0.00018039396219834237, "loss": 0.4095, "step": 277 }, { "epoch": 0.20956419316843344, "grad_norm": 7.184628009796143, "learning_rate": 0.00018025187031807532, "loss": 0.421, "step": 278 }, { "epoch": 0.21031802120141344, "grad_norm": 6.9601569175720215, "learning_rate": 0.00018010932178813397, "loss": 0.4329, "step": 279 }, { "epoch": 0.2110718492343934, "grad_norm": 7.579134464263916, "learning_rate": 0.00017996631741964888, "loss": 0.439, "step": 280 }, { "epoch": 0.2118256772673734, "grad_norm": 7.37368106842041, "learning_rate": 0.00017982285802634426, "loss": 0.4225, "step": 281 }, { "epoch": 0.21257950530035336, "grad_norm": 7.1782145500183105, "learning_rate": 0.0001796789444245337, "loss": 0.4094, "step": 282 }, { "epoch": 0.21333333333333335, "grad_norm": 7.470993995666504, "learning_rate": 0.00017953457743311523, "loss": 0.4267, "step": 283 }, { "epoch": 0.2140871613663133, "grad_norm": 7.285700798034668, "learning_rate": 0.00017938975787356673, "loss": 0.4113, "step": 284 }, { "epoch": 0.21484098939929328, "grad_norm": 7.5254130363464355, "learning_rate": 0.00017924448656994133, "loss": 0.4362, "step": 285 }, { "epoch": 0.21559481743227327, "grad_norm": 7.6265411376953125, "learning_rate": 0.00017909876434886273, "loss": 0.443, "step": 286 }, { "epoch": 0.21634864546525323, "grad_norm": 7.822786808013916, "learning_rate": 0.00017895259203952032, "loss": 0.4385, "step": 287 }, { "epoch": 0.21710247349823322, "grad_norm": 7.836915969848633, "learning_rate": 0.0001788059704736647, "loss": 0.4509, "step": 288 }, { "epoch": 0.2178563015312132, "grad_norm": 8.352907180786133, "learning_rate": 0.00017865890048560277, "loss": 0.4747, "step": 289 }, { "epoch": 0.21861012956419318, "grad_norm": 8.010136604309082, "learning_rate": 0.00017851138291219301, "loss": 0.4662, "step": 290 }, { "epoch": 0.21936395759717314, "grad_norm": 8.264348983764648, "learning_rate": 0.00017836341859284093, "loss": 0.4473, "step": 291 }, { "epoch": 0.22011778563015313, "grad_norm": 8.917752265930176, "learning_rate": 0.00017821500836949386, "loss": 0.4909, "step": 292 }, { "epoch": 0.2208716136631331, "grad_norm": 9.103057861328125, "learning_rate": 0.0001780661530866366, "loss": 0.4885, "step": 293 }, { "epoch": 0.22162544169611306, "grad_norm": 10.667252540588379, "learning_rate": 0.00017791685359128633, "loss": 0.5175, "step": 294 }, { "epoch": 0.22237926972909305, "grad_norm": 9.840495109558105, "learning_rate": 0.000177767110732988, "loss": 0.5175, "step": 295 }, { "epoch": 0.22313309776207302, "grad_norm": 10.290101051330566, "learning_rate": 0.00017761692536380928, "loss": 0.4749, "step": 296 }, { "epoch": 0.223886925795053, "grad_norm": 10.657001495361328, "learning_rate": 0.00017746629833833585, "loss": 0.534, "step": 297 }, { "epoch": 0.22464075382803297, "grad_norm": 10.042377471923828, "learning_rate": 0.00017731523051366658, "loss": 0.454, "step": 298 }, { "epoch": 0.22539458186101297, "grad_norm": 12.303505897521973, "learning_rate": 0.00017716372274940843, "loss": 0.5157, "step": 299 }, { "epoch": 0.22614840989399293, "grad_norm": 16.197650909423828, "learning_rate": 0.00017701177590767183, "loss": 0.5521, "step": 300 }, { "epoch": 0.22690223792697292, "grad_norm": 15.125090599060059, "learning_rate": 0.00017685939085306562, "loss": 0.6868, "step": 301 }, { "epoch": 0.22765606595995289, "grad_norm": 13.107701301574707, "learning_rate": 0.00017670656845269214, "loss": 0.6326, "step": 302 }, { "epoch": 0.22840989399293285, "grad_norm": 9.953380584716797, "learning_rate": 0.00017655330957614234, "loss": 0.596, "step": 303 }, { "epoch": 0.22916372202591284, "grad_norm": 7.864305019378662, "learning_rate": 0.00017639961509549078, "loss": 0.5477, "step": 304 }, { "epoch": 0.2299175500588928, "grad_norm": 6.731385707855225, "learning_rate": 0.00017624548588529072, "loss": 0.4891, "step": 305 }, { "epoch": 0.2306713780918728, "grad_norm": 6.979381084442139, "learning_rate": 0.00017609092282256912, "loss": 0.4611, "step": 306 }, { "epoch": 0.23142520612485276, "grad_norm": 8.147210121154785, "learning_rate": 0.00017593592678682166, "loss": 0.5077, "step": 307 }, { "epoch": 0.23217903415783275, "grad_norm": 7.303165435791016, "learning_rate": 0.0001757804986600077, "loss": 0.4771, "step": 308 }, { "epoch": 0.23293286219081272, "grad_norm": 7.042153358459473, "learning_rate": 0.0001756246393265453, "loss": 0.4718, "step": 309 }, { "epoch": 0.2336866902237927, "grad_norm": 7.572822570800781, "learning_rate": 0.00017546834967330617, "loss": 0.4719, "step": 310 }, { "epoch": 0.23444051825677267, "grad_norm": 7.078078269958496, "learning_rate": 0.00017531163058961066, "loss": 0.4345, "step": 311 }, { "epoch": 0.23519434628975264, "grad_norm": 7.183956623077393, "learning_rate": 0.00017515448296722262, "loss": 0.4631, "step": 312 }, { "epoch": 0.23594817432273263, "grad_norm": 7.140283584594727, "learning_rate": 0.00017499690770034443, "loss": 0.4554, "step": 313 }, { "epoch": 0.2367020023557126, "grad_norm": 7.176611423492432, "learning_rate": 0.00017483890568561173, "loss": 0.4603, "step": 314 }, { "epoch": 0.23745583038869258, "grad_norm": 6.916821002960205, "learning_rate": 0.00017468047782208865, "loss": 0.4406, "step": 315 }, { "epoch": 0.23820965842167255, "grad_norm": 7.564478874206543, "learning_rate": 0.00017452162501126227, "loss": 0.4608, "step": 316 }, { "epoch": 0.23896348645465254, "grad_norm": 7.078012466430664, "learning_rate": 0.00017436234815703788, "loss": 0.4254, "step": 317 }, { "epoch": 0.2397173144876325, "grad_norm": 7.39133358001709, "learning_rate": 0.0001742026481657335, "loss": 0.4412, "step": 318 }, { "epoch": 0.2404711425206125, "grad_norm": 7.540102005004883, "learning_rate": 0.0001740425259460751, "loss": 0.4444, "step": 319 }, { "epoch": 0.24122497055359246, "grad_norm": 7.027541160583496, "learning_rate": 0.00017388198240919102, "loss": 0.439, "step": 320 }, { "epoch": 0.24197879858657242, "grad_norm": 7.218184947967529, "learning_rate": 0.00017372101846860707, "loss": 0.4239, "step": 321 }, { "epoch": 0.24273262661955242, "grad_norm": 7.92561674118042, "learning_rate": 0.00017355963504024123, "loss": 0.4278, "step": 322 }, { "epoch": 0.24348645465253238, "grad_norm": 7.72558069229126, "learning_rate": 0.00017339783304239843, "loss": 0.4498, "step": 323 }, { "epoch": 0.24424028268551237, "grad_norm": 7.2504096031188965, "learning_rate": 0.00017323561339576543, "loss": 0.4355, "step": 324 }, { "epoch": 0.24499411071849234, "grad_norm": 7.207572937011719, "learning_rate": 0.0001730729770234054, "loss": 0.4192, "step": 325 }, { "epoch": 0.24574793875147233, "grad_norm": 7.010448455810547, "learning_rate": 0.00017290992485075282, "loss": 0.3983, "step": 326 }, { "epoch": 0.2465017667844523, "grad_norm": 7.16871452331543, "learning_rate": 0.0001727464578056081, "loss": 0.4454, "step": 327 }, { "epoch": 0.24725559481743228, "grad_norm": 7.185717582702637, "learning_rate": 0.00017258257681813244, "loss": 0.426, "step": 328 }, { "epoch": 0.24800942285041225, "grad_norm": 7.441746234893799, "learning_rate": 0.0001724182828208424, "loss": 0.4394, "step": 329 }, { "epoch": 0.24876325088339224, "grad_norm": 7.429843902587891, "learning_rate": 0.0001722535767486047, "loss": 0.4377, "step": 330 }, { "epoch": 0.2495170789163722, "grad_norm": 7.528452396392822, "learning_rate": 0.00017208845953863076, "loss": 0.4256, "step": 331 }, { "epoch": 0.25027090694935217, "grad_norm": 6.993783473968506, "learning_rate": 0.0001719229321304716, "loss": 0.4337, "step": 332 }, { "epoch": 0.25027090694935217, "eval_loss": 0.47317659854888916, "eval_runtime": 126.4401, "eval_samples_per_second": 17.676, "eval_steps_per_second": 8.842, "step": 332 }, { "epoch": 0.25102473498233213, "grad_norm": 7.080078601837158, "learning_rate": 0.00017175699546601223, "loss": 0.443, "step": 333 }, { "epoch": 0.25177856301531215, "grad_norm": 7.021576404571533, "learning_rate": 0.00017159065048946644, "loss": 0.4211, "step": 334 }, { "epoch": 0.2525323910482921, "grad_norm": 7.684916019439697, "learning_rate": 0.00017142389814737142, "loss": 0.4115, "step": 335 }, { "epoch": 0.2532862190812721, "grad_norm": 7.011744976043701, "learning_rate": 0.00017125673938858237, "loss": 0.4057, "step": 336 }, { "epoch": 0.25404004711425204, "grad_norm": 7.142672538757324, "learning_rate": 0.00017108917516426704, "loss": 0.4485, "step": 337 }, { "epoch": 0.25479387514723206, "grad_norm": 7.860468864440918, "learning_rate": 0.00017092120642790042, "loss": 0.4134, "step": 338 }, { "epoch": 0.255547703180212, "grad_norm": 8.12804889678955, "learning_rate": 0.00017075283413525916, "loss": 0.4449, "step": 339 }, { "epoch": 0.256301531213192, "grad_norm": 7.87144136428833, "learning_rate": 0.00017058405924441636, "loss": 0.3987, "step": 340 }, { "epoch": 0.25705535924617195, "grad_norm": 7.7459588050842285, "learning_rate": 0.00017041488271573587, "loss": 0.4271, "step": 341 }, { "epoch": 0.2578091872791519, "grad_norm": 8.934653282165527, "learning_rate": 0.00017024530551186702, "loss": 0.4722, "step": 342 }, { "epoch": 0.25856301531213194, "grad_norm": 8.811241149902344, "learning_rate": 0.000170075328597739, "loss": 0.4719, "step": 343 }, { "epoch": 0.2593168433451119, "grad_norm": 9.294290542602539, "learning_rate": 0.00016990495294055548, "loss": 0.4963, "step": 344 }, { "epoch": 0.26007067137809187, "grad_norm": 11.440875053405762, "learning_rate": 0.00016973417950978906, "loss": 0.5236, "step": 345 }, { "epoch": 0.26082449941107183, "grad_norm": 10.008340835571289, "learning_rate": 0.00016956300927717575, "loss": 0.5081, "step": 346 }, { "epoch": 0.26157832744405185, "grad_norm": 10.798213958740234, "learning_rate": 0.0001693914432167094, "loss": 0.5252, "step": 347 }, { "epoch": 0.2623321554770318, "grad_norm": 12.772528648376465, "learning_rate": 0.00016921948230463625, "loss": 0.5073, "step": 348 }, { "epoch": 0.2630859835100118, "grad_norm": 12.81511402130127, "learning_rate": 0.00016904712751944931, "loss": 0.4699, "step": 349 }, { "epoch": 0.26383981154299174, "grad_norm": 13.554988861083984, "learning_rate": 0.00016887437984188286, "loss": 0.4963, "step": 350 }, { "epoch": 0.2645936395759717, "grad_norm": 17.339111328125, "learning_rate": 0.00016870124025490673, "loss": 0.6331, "step": 351 }, { "epoch": 0.2653474676089517, "grad_norm": 14.55565357208252, "learning_rate": 0.0001685277097437208, "loss": 0.6053, "step": 352 }, { "epoch": 0.2661012956419317, "grad_norm": 11.207347869873047, "learning_rate": 0.0001683537892957495, "loss": 0.5787, "step": 353 }, { "epoch": 0.26685512367491165, "grad_norm": 8.820387840270996, "learning_rate": 0.00016817947990063598, "loss": 0.5605, "step": 354 }, { "epoch": 0.2676089517078916, "grad_norm": 7.382798194885254, "learning_rate": 0.0001680047825502366, "loss": 0.4917, "step": 355 }, { "epoch": 0.26836277974087164, "grad_norm": 7.330126762390137, "learning_rate": 0.00016782969823861526, "loss": 0.4976, "step": 356 }, { "epoch": 0.2691166077738516, "grad_norm": 8.046545028686523, "learning_rate": 0.0001676542279620378, "loss": 0.4864, "step": 357 }, { "epoch": 0.26987043580683157, "grad_norm": 7.838155746459961, "learning_rate": 0.00016747837271896622, "loss": 0.4797, "step": 358 }, { "epoch": 0.27062426383981153, "grad_norm": 7.075133323669434, "learning_rate": 0.00016730213351005303, "loss": 0.4655, "step": 359 }, { "epoch": 0.2713780918727915, "grad_norm": 6.840551853179932, "learning_rate": 0.00016712551133813572, "loss": 0.4453, "step": 360 }, { "epoch": 0.2721319199057715, "grad_norm": 7.175273418426514, "learning_rate": 0.0001669485072082308, "loss": 0.447, "step": 361 }, { "epoch": 0.2728857479387515, "grad_norm": 8.195796012878418, "learning_rate": 0.00016677112212752824, "loss": 0.4869, "step": 362 }, { "epoch": 0.27363957597173144, "grad_norm": 7.310915946960449, "learning_rate": 0.00016659335710538564, "loss": 0.4447, "step": 363 }, { "epoch": 0.2743934040047114, "grad_norm": 7.676048755645752, "learning_rate": 0.00016641521315332265, "loss": 0.4507, "step": 364 }, { "epoch": 0.2751472320376914, "grad_norm": 7.88531494140625, "learning_rate": 0.00016623669128501504, "loss": 0.4411, "step": 365 }, { "epoch": 0.2759010600706714, "grad_norm": 7.499680995941162, "learning_rate": 0.00016605779251628903, "loss": 0.4629, "step": 366 }, { "epoch": 0.27665488810365135, "grad_norm": 6.773830890655518, "learning_rate": 0.00016587851786511543, "loss": 0.4571, "step": 367 }, { "epoch": 0.2774087161366313, "grad_norm": 7.170431613922119, "learning_rate": 0.00016569886835160399, "loss": 0.4313, "step": 368 }, { "epoch": 0.2781625441696113, "grad_norm": 6.66681432723999, "learning_rate": 0.0001655188449979974, "loss": 0.425, "step": 369 }, { "epoch": 0.2789163722025913, "grad_norm": 6.042294025421143, "learning_rate": 0.00016533844882866568, "loss": 0.4236, "step": 370 }, { "epoch": 0.27967020023557126, "grad_norm": 6.5642924308776855, "learning_rate": 0.00016515768087010013, "loss": 0.4404, "step": 371 }, { "epoch": 0.28042402826855123, "grad_norm": 7.063207626342773, "learning_rate": 0.00016497654215090772, "loss": 0.428, "step": 372 }, { "epoch": 0.2811778563015312, "grad_norm": 6.705799579620361, "learning_rate": 0.00016479503370180507, "loss": 0.431, "step": 373 }, { "epoch": 0.2819316843345112, "grad_norm": 6.578817367553711, "learning_rate": 0.00016461315655561263, "loss": 0.4126, "step": 374 }, { "epoch": 0.2826855123674912, "grad_norm": 6.545943260192871, "learning_rate": 0.00016443091174724885, "loss": 0.4198, "step": 375 }, { "epoch": 0.28343934040047114, "grad_norm": 6.834047794342041, "learning_rate": 0.00016424830031372425, "loss": 0.4378, "step": 376 }, { "epoch": 0.2841931684334511, "grad_norm": 7.931153774261475, "learning_rate": 0.00016406532329413546, "loss": 0.4529, "step": 377 }, { "epoch": 0.28494699646643107, "grad_norm": 7.077485084533691, "learning_rate": 0.00016388198172965942, "loss": 0.4281, "step": 378 }, { "epoch": 0.2857008244994111, "grad_norm": 7.532230854034424, "learning_rate": 0.00016369827666354745, "loss": 0.4064, "step": 379 }, { "epoch": 0.28645465253239105, "grad_norm": 7.111504554748535, "learning_rate": 0.00016351420914111916, "loss": 0.4392, "step": 380 }, { "epoch": 0.287208480565371, "grad_norm": 7.107287883758545, "learning_rate": 0.0001633297802097567, "loss": 0.3896, "step": 381 }, { "epoch": 0.287962308598351, "grad_norm": 6.906205654144287, "learning_rate": 0.0001631449909188987, "loss": 0.4263, "step": 382 }, { "epoch": 0.288716136631331, "grad_norm": 7.226500034332275, "learning_rate": 0.00016295984232003426, "loss": 0.4482, "step": 383 }, { "epoch": 0.28946996466431096, "grad_norm": 6.622352123260498, "learning_rate": 0.00016277433546669703, "loss": 0.4044, "step": 384 }, { "epoch": 0.2902237926972909, "grad_norm": 7.164252281188965, "learning_rate": 0.00016258847141445928, "loss": 0.4253, "step": 385 }, { "epoch": 0.2909776207302709, "grad_norm": 7.356839656829834, "learning_rate": 0.00016240225122092573, "loss": 0.427, "step": 386 }, { "epoch": 0.29173144876325086, "grad_norm": 8.345090866088867, "learning_rate": 0.00016221567594572762, "loss": 0.4204, "step": 387 }, { "epoch": 0.2924852767962309, "grad_norm": 7.662243366241455, "learning_rate": 0.00016202874665051674, "loss": 0.393, "step": 388 }, { "epoch": 0.29323910482921084, "grad_norm": 7.708904266357422, "learning_rate": 0.00016184146439895928, "loss": 0.411, "step": 389 }, { "epoch": 0.2939929328621908, "grad_norm": 7.000946044921875, "learning_rate": 0.00016165383025672981, "loss": 0.3893, "step": 390 }, { "epoch": 0.29474676089517077, "grad_norm": 7.401767253875732, "learning_rate": 0.00016146584529150526, "loss": 0.3869, "step": 391 }, { "epoch": 0.2955005889281508, "grad_norm": 7.715709209442139, "learning_rate": 0.0001612775105729588, "loss": 0.402, "step": 392 }, { "epoch": 0.29625441696113075, "grad_norm": 8.78487491607666, "learning_rate": 0.00016108882717275384, "loss": 0.4899, "step": 393 }, { "epoch": 0.2970082449941107, "grad_norm": 9.631272315979004, "learning_rate": 0.0001608997961645377, "loss": 0.4919, "step": 394 }, { "epoch": 0.2977620730270907, "grad_norm": 9.458671569824219, "learning_rate": 0.00016071041862393578, "loss": 0.4955, "step": 395 }, { "epoch": 0.2985159010600707, "grad_norm": 10.232501029968262, "learning_rate": 0.0001605206956285454, "loss": 0.4977, "step": 396 }, { "epoch": 0.29926972909305066, "grad_norm": 9.963619232177734, "learning_rate": 0.00016033062825792935, "loss": 0.4679, "step": 397 }, { "epoch": 0.3000235571260306, "grad_norm": 12.23200798034668, "learning_rate": 0.0001601402175936102, "loss": 0.5541, "step": 398 }, { "epoch": 0.3007773851590106, "grad_norm": 11.938904762268066, "learning_rate": 0.00015994946471906382, "loss": 0.4678, "step": 399 }, { "epoch": 0.30153121319199055, "grad_norm": 14.236066818237305, "learning_rate": 0.0001597583707197134, "loss": 0.534, "step": 400 }, { "epoch": 0.3022850412249706, "grad_norm": 12.790224075317383, "learning_rate": 0.00015956693668292313, "loss": 0.6361, "step": 401 }, { "epoch": 0.30303886925795054, "grad_norm": 14.324430465698242, "learning_rate": 0.00015937516369799216, "loss": 0.6471, "step": 402 }, { "epoch": 0.3037926972909305, "grad_norm": 10.209970474243164, "learning_rate": 0.00015918305285614822, "loss": 0.5906, "step": 403 }, { "epoch": 0.30454652532391047, "grad_norm": 7.869755744934082, "learning_rate": 0.00015899060525054157, "loss": 0.5408, "step": 404 }, { "epoch": 0.3053003533568905, "grad_norm": 6.786082744598389, "learning_rate": 0.0001587978219762388, "loss": 0.5095, "step": 405 }, { "epoch": 0.30605418138987045, "grad_norm": 8.50927448272705, "learning_rate": 0.00015860470413021642, "loss": 0.5117, "step": 406 }, { "epoch": 0.3068080094228504, "grad_norm": 7.6895833015441895, "learning_rate": 0.00015841125281135473, "loss": 0.4919, "step": 407 }, { "epoch": 0.3075618374558304, "grad_norm": 7.566605567932129, "learning_rate": 0.00015821746912043165, "loss": 0.4561, "step": 408 }, { "epoch": 0.30831566548881034, "grad_norm": 7.5333452224731445, "learning_rate": 0.00015802335416011625, "loss": 0.4735, "step": 409 }, { "epoch": 0.30906949352179036, "grad_norm": 7.508667469024658, "learning_rate": 0.00015782890903496264, "loss": 0.4461, "step": 410 }, { "epoch": 0.3098233215547703, "grad_norm": 6.778057098388672, "learning_rate": 0.00015763413485140365, "loss": 0.4589, "step": 411 }, { "epoch": 0.3105771495877503, "grad_norm": 6.7967915534973145, "learning_rate": 0.00015743903271774455, "loss": 0.4438, "step": 412 }, { "epoch": 0.31133097762073025, "grad_norm": 7.60194730758667, "learning_rate": 0.0001572436037441566, "loss": 0.4371, "step": 413 }, { "epoch": 0.3120848056537103, "grad_norm": 7.298644065856934, "learning_rate": 0.00015704784904267097, "loss": 0.4678, "step": 414 }, { "epoch": 0.31283863368669024, "grad_norm": 6.711719036102295, "learning_rate": 0.00015685176972717223, "loss": 0.4511, "step": 415 }, { "epoch": 0.3135924617196702, "grad_norm": 8.647915840148926, "learning_rate": 0.00015665536691339207, "loss": 0.4697, "step": 416 }, { "epoch": 0.31434628975265017, "grad_norm": 7.388605117797852, "learning_rate": 0.00015645864171890295, "loss": 0.4322, "step": 417 }, { "epoch": 0.31510011778563013, "grad_norm": 7.3222198486328125, "learning_rate": 0.00015626159526311174, "loss": 0.4366, "step": 418 }, { "epoch": 0.31585394581861015, "grad_norm": 6.875087738037109, "learning_rate": 0.00015606422866725343, "loss": 0.4464, "step": 419 }, { "epoch": 0.3166077738515901, "grad_norm": 6.434317111968994, "learning_rate": 0.00015586654305438456, "loss": 0.4161, "step": 420 }, { "epoch": 0.3173616018845701, "grad_norm": 7.1308488845825195, "learning_rate": 0.00015566853954937694, "loss": 0.4558, "step": 421 }, { "epoch": 0.31811542991755004, "grad_norm": 7.582878112792969, "learning_rate": 0.00015547021927891144, "loss": 0.4789, "step": 422 }, { "epoch": 0.31886925795053006, "grad_norm": 6.73392391204834, "learning_rate": 0.00015527158337147112, "loss": 0.45, "step": 423 }, { "epoch": 0.31962308598351, "grad_norm": 7.364933967590332, "learning_rate": 0.00015507263295733528, "loss": 0.4156, "step": 424 }, { "epoch": 0.32037691401649, "grad_norm": 6.4493842124938965, "learning_rate": 0.00015487336916857278, "loss": 0.4147, "step": 425 }, { "epoch": 0.32113074204946995, "grad_norm": 6.886701583862305, "learning_rate": 0.00015467379313903557, "loss": 0.4271, "step": 426 }, { "epoch": 0.3218845700824499, "grad_norm": 6.938616752624512, "learning_rate": 0.00015447390600435238, "loss": 0.4356, "step": 427 }, { "epoch": 0.32263839811542994, "grad_norm": 7.1376214027404785, "learning_rate": 0.00015427370890192224, "loss": 0.411, "step": 428 }, { "epoch": 0.3233922261484099, "grad_norm": 7.260872840881348, "learning_rate": 0.00015407320297090786, "loss": 0.4505, "step": 429 }, { "epoch": 0.32414605418138986, "grad_norm": 7.035525321960449, "learning_rate": 0.00015387238935222927, "loss": 0.4032, "step": 430 }, { "epoch": 0.32489988221436983, "grad_norm": 6.7771782875061035, "learning_rate": 0.00015367126918855738, "loss": 0.4135, "step": 431 }, { "epoch": 0.32565371024734985, "grad_norm": 7.255315780639648, "learning_rate": 0.0001534698436243073, "loss": 0.4376, "step": 432 }, { "epoch": 0.3264075382803298, "grad_norm": 6.563286781311035, "learning_rate": 0.00015326811380563204, "loss": 0.3936, "step": 433 }, { "epoch": 0.3271613663133098, "grad_norm": 8.582233428955078, "learning_rate": 0.0001530660808804158, "loss": 0.3979, "step": 434 }, { "epoch": 0.32791519434628974, "grad_norm": 6.628231048583984, "learning_rate": 0.00015286374599826754, "loss": 0.4143, "step": 435 }, { "epoch": 0.3286690223792697, "grad_norm": 6.581121921539307, "learning_rate": 0.00015266111031051442, "loss": 0.4313, "step": 436 }, { "epoch": 0.3294228504122497, "grad_norm": 6.923291206359863, "learning_rate": 0.00015245817497019524, "loss": 0.3921, "step": 437 }, { "epoch": 0.3301766784452297, "grad_norm": 7.172369480133057, "learning_rate": 0.00015225494113205393, "loss": 0.4249, "step": 438 }, { "epoch": 0.33093050647820965, "grad_norm": 7.134575843811035, "learning_rate": 0.00015205140995253283, "loss": 0.4148, "step": 439 }, { "epoch": 0.3316843345111896, "grad_norm": 8.403553009033203, "learning_rate": 0.00015184758258976637, "loss": 0.447, "step": 440 }, { "epoch": 0.33243816254416964, "grad_norm": 7.707136154174805, "learning_rate": 0.00015164346020357417, "loss": 0.4165, "step": 441 }, { "epoch": 0.3331919905771496, "grad_norm": 8.08395004272461, "learning_rate": 0.00015143904395545466, "loss": 0.461, "step": 442 }, { "epoch": 0.33394581861012956, "grad_norm": 9.609329223632812, "learning_rate": 0.0001512343350085784, "loss": 0.5137, "step": 443 }, { "epoch": 0.3346996466431095, "grad_norm": 9.876978874206543, "learning_rate": 0.0001510293345277815, "loss": 0.5053, "step": 444 }, { "epoch": 0.3354534746760895, "grad_norm": 9.40042495727539, "learning_rate": 0.0001508240436795589, "loss": 0.5114, "step": 445 }, { "epoch": 0.3362073027090695, "grad_norm": 10.623950958251953, "learning_rate": 0.00015061846363205784, "loss": 0.497, "step": 446 }, { "epoch": 0.3369611307420495, "grad_norm": 10.993450164794922, "learning_rate": 0.00015041259555507108, "loss": 0.49, "step": 447 }, { "epoch": 0.33771495877502944, "grad_norm": 11.963092803955078, "learning_rate": 0.00015020644062003046, "loss": 0.5261, "step": 448 }, { "epoch": 0.3384687868080094, "grad_norm": 11.985857963562012, "learning_rate": 0.00015000000000000001, "loss": 0.5063, "step": 449 }, { "epoch": 0.3392226148409894, "grad_norm": 13.582792282104492, "learning_rate": 0.00014979327486966938, "loss": 0.4568, "step": 450 }, { "epoch": 0.3399764428739694, "grad_norm": 10.956193923950195, "learning_rate": 0.0001495862664053471, "loss": 0.6271, "step": 451 }, { "epoch": 0.34073027090694935, "grad_norm": 10.826944351196289, "learning_rate": 0.0001493789757849541, "loss": 0.5646, "step": 452 }, { "epoch": 0.3414840989399293, "grad_norm": 9.086105346679688, "learning_rate": 0.00014917140418801655, "loss": 0.5347, "step": 453 }, { "epoch": 0.3422379269729093, "grad_norm": 7.542895317077637, "learning_rate": 0.00014896355279565976, "loss": 0.547, "step": 454 }, { "epoch": 0.3429917550058893, "grad_norm": 6.925205707550049, "learning_rate": 0.00014875542279060085, "loss": 0.5174, "step": 455 }, { "epoch": 0.34374558303886926, "grad_norm": 6.2740159034729, "learning_rate": 0.00014854701535714244, "loss": 0.4569, "step": 456 }, { "epoch": 0.3444994110718492, "grad_norm": 6.751154899597168, "learning_rate": 0.00014833833168116582, "loss": 0.4848, "step": 457 }, { "epoch": 0.3452532391048292, "grad_norm": 6.805966854095459, "learning_rate": 0.00014812937295012406, "loss": 0.454, "step": 458 }, { "epoch": 0.3460070671378092, "grad_norm": 6.805473327636719, "learning_rate": 0.00014792014035303535, "loss": 0.4459, "step": 459 }, { "epoch": 0.3467608951707892, "grad_norm": 6.896597385406494, "learning_rate": 0.00014771063508047636, "loss": 0.4492, "step": 460 }, { "epoch": 0.34751472320376914, "grad_norm": 6.992384433746338, "learning_rate": 0.00014750085832457519, "loss": 0.4737, "step": 461 }, { "epoch": 0.3482685512367491, "grad_norm": 7.02846622467041, "learning_rate": 0.00014729081127900476, "loss": 0.4786, "step": 462 }, { "epoch": 0.34902237926972907, "grad_norm": 7.123291015625, "learning_rate": 0.0001470804951389761, "loss": 0.4397, "step": 463 }, { "epoch": 0.3497762073027091, "grad_norm": 6.681251049041748, "learning_rate": 0.00014686991110123135, "loss": 0.4398, "step": 464 }, { "epoch": 0.35053003533568905, "grad_norm": 7.414073944091797, "learning_rate": 0.00014665906036403706, "loss": 0.4626, "step": 465 }, { "epoch": 0.351283863368669, "grad_norm": 6.917845726013184, "learning_rate": 0.00014644794412717736, "loss": 0.4312, "step": 466 }, { "epoch": 0.352037691401649, "grad_norm": 6.451867580413818, "learning_rate": 0.00014623656359194712, "loss": 0.4101, "step": 467 }, { "epoch": 0.352791519434629, "grad_norm": 7.152139663696289, "learning_rate": 0.00014602491996114516, "loss": 0.4518, "step": 468 }, { "epoch": 0.35354534746760896, "grad_norm": 7.701825141906738, "learning_rate": 0.0001458130144390673, "loss": 0.4568, "step": 469 }, { "epoch": 0.3542991755005889, "grad_norm": 7.278562545776367, "learning_rate": 0.00014560084823149965, "loss": 0.4222, "step": 470 }, { "epoch": 0.3550530035335689, "grad_norm": 6.47285270690918, "learning_rate": 0.0001453884225457116, "loss": 0.465, "step": 471 }, { "epoch": 0.35580683156654885, "grad_norm": 6.140552520751953, "learning_rate": 0.00014517573859044907, "loss": 0.4219, "step": 472 }, { "epoch": 0.3565606595995289, "grad_norm": 6.481984615325928, "learning_rate": 0.00014496279757592766, "loss": 0.4446, "step": 473 }, { "epoch": 0.35731448763250884, "grad_norm": 6.575818061828613, "learning_rate": 0.0001447496007138255, "loss": 0.4297, "step": 474 }, { "epoch": 0.3580683156654888, "grad_norm": 6.637454509735107, "learning_rate": 0.00014453614921727668, "loss": 0.4311, "step": 475 }, { "epoch": 0.35882214369846877, "grad_norm": 6.832921981811523, "learning_rate": 0.00014432244430086423, "loss": 0.4469, "step": 476 }, { "epoch": 0.3595759717314488, "grad_norm": 7.260216236114502, "learning_rate": 0.00014410848718061312, "loss": 0.4206, "step": 477 }, { "epoch": 0.36032979976442875, "grad_norm": 6.812548637390137, "learning_rate": 0.00014389427907398342, "loss": 0.4146, "step": 478 }, { "epoch": 0.3610836277974087, "grad_norm": 6.668044090270996, "learning_rate": 0.00014367982119986342, "loss": 0.4333, "step": 479 }, { "epoch": 0.3618374558303887, "grad_norm": 7.100220680236816, "learning_rate": 0.00014346511477856259, "loss": 0.4174, "step": 480 }, { "epoch": 0.3625912838633687, "grad_norm": 7.15718936920166, "learning_rate": 0.0001432501610318047, "loss": 0.4258, "step": 481 }, { "epoch": 0.36334511189634866, "grad_norm": 7.051331520080566, "learning_rate": 0.00014303496118272084, "loss": 0.4048, "step": 482 }, { "epoch": 0.3640989399293286, "grad_norm": 7.344452381134033, "learning_rate": 0.0001428195164558425, "loss": 0.4137, "step": 483 }, { "epoch": 0.3648527679623086, "grad_norm": 7.5303850173950195, "learning_rate": 0.00014260382807709457, "loss": 0.421, "step": 484 }, { "epoch": 0.36560659599528855, "grad_norm": 6.944647789001465, "learning_rate": 0.0001423878972737883, "loss": 0.4059, "step": 485 }, { "epoch": 0.36636042402826857, "grad_norm": 7.10966682434082, "learning_rate": 0.0001421717252746145, "loss": 0.4038, "step": 486 }, { "epoch": 0.36711425206124854, "grad_norm": 6.702695369720459, "learning_rate": 0.00014195531330963635, "loss": 0.3999, "step": 487 }, { "epoch": 0.3678680800942285, "grad_norm": 8.255915641784668, "learning_rate": 0.0001417386626102825, "loss": 0.3961, "step": 488 }, { "epoch": 0.36862190812720846, "grad_norm": 8.199605941772461, "learning_rate": 0.00014152177440934012, "loss": 0.4079, "step": 489 }, { "epoch": 0.3693757361601885, "grad_norm": 7.717386245727539, "learning_rate": 0.0001413046499409477, "loss": 0.3932, "step": 490 }, { "epoch": 0.37012956419316845, "grad_norm": 7.842260837554932, "learning_rate": 0.0001410872904405882, "loss": 0.4383, "step": 491 }, { "epoch": 0.3708833922261484, "grad_norm": 8.819681167602539, "learning_rate": 0.00014086969714508196, "loss": 0.4763, "step": 492 }, { "epoch": 0.3716372202591284, "grad_norm": 8.904485702514648, "learning_rate": 0.00014065187129257964, "loss": 0.4711, "step": 493 }, { "epoch": 0.37239104829210834, "grad_norm": 9.481599807739258, "learning_rate": 0.00014043381412255526, "loss": 0.5002, "step": 494 }, { "epoch": 0.37314487632508836, "grad_norm": 9.55698013305664, "learning_rate": 0.00014021552687579902, "loss": 0.454, "step": 495 }, { "epoch": 0.3738987043580683, "grad_norm": 9.685362815856934, "learning_rate": 0.00013999701079441028, "loss": 0.4687, "step": 496 }, { "epoch": 0.3746525323910483, "grad_norm": 10.087312698364258, "learning_rate": 0.00013977826712179058, "loss": 0.4855, "step": 497 }, { "epoch": 0.37540636042402825, "grad_norm": 10.978914260864258, "learning_rate": 0.00013955929710263653, "loss": 0.485, "step": 498 }, { "epoch": 0.37616018845700827, "grad_norm": 11.427350044250488, "learning_rate": 0.00013934010198293257, "loss": 0.4536, "step": 499 }, { "epoch": 0.37691401648998824, "grad_norm": 12.61874771118164, "learning_rate": 0.00013912068300994413, "loss": 0.4844, "step": 500 }, { "epoch": 0.3776678445229682, "grad_norm": 11.156290054321289, "learning_rate": 0.0001389010414322104, "loss": 0.6025, "step": 501 }, { "epoch": 0.37842167255594816, "grad_norm": 10.892552375793457, "learning_rate": 0.0001386811784995371, "loss": 0.6063, "step": 502 }, { "epoch": 0.3791755005889281, "grad_norm": 9.48608112335205, "learning_rate": 0.00013846109546298971, "loss": 0.5153, "step": 503 }, { "epoch": 0.37992932862190815, "grad_norm": 7.735827922821045, "learning_rate": 0.00013824079357488598, "loss": 0.5102, "step": 504 }, { "epoch": 0.3806831566548881, "grad_norm": 6.837904453277588, "learning_rate": 0.0001380202740887891, "loss": 0.4952, "step": 505 }, { "epoch": 0.3814369846878681, "grad_norm": 6.260585308074951, "learning_rate": 0.00013779953825950034, "loss": 0.4751, "step": 506 }, { "epoch": 0.38219081272084804, "grad_norm": 6.398446083068848, "learning_rate": 0.00013757858734305203, "loss": 0.4449, "step": 507 }, { "epoch": 0.38294464075382806, "grad_norm": 7.3623881340026855, "learning_rate": 0.0001373574225967004, "loss": 0.4859, "step": 508 }, { "epoch": 0.383698468786808, "grad_norm": 7.673310279846191, "learning_rate": 0.00013713604527891844, "loss": 0.4804, "step": 509 }, { "epoch": 0.384452296819788, "grad_norm": 6.531475067138672, "learning_rate": 0.00013691445664938866, "loss": 0.4491, "step": 510 }, { "epoch": 0.38520612485276795, "grad_norm": 6.5302300453186035, "learning_rate": 0.00013669265796899607, "loss": 0.4277, "step": 511 }, { "epoch": 0.3859599528857479, "grad_norm": 6.498359680175781, "learning_rate": 0.00013647065049982078, "loss": 0.4473, "step": 512 }, { "epoch": 0.38671378091872793, "grad_norm": 7.777768135070801, "learning_rate": 0.0001362484355051311, "loss": 0.4485, "step": 513 }, { "epoch": 0.3874676089517079, "grad_norm": 6.4952192306518555, "learning_rate": 0.00013602601424937604, "loss": 0.4144, "step": 514 }, { "epoch": 0.38822143698468786, "grad_norm": 7.111438274383545, "learning_rate": 0.00013580338799817844, "loss": 0.4314, "step": 515 }, { "epoch": 0.3889752650176678, "grad_norm": 6.711978435516357, "learning_rate": 0.00013558055801832748, "loss": 0.4476, "step": 516 }, { "epoch": 0.38972909305064785, "grad_norm": 6.2299370765686035, "learning_rate": 0.0001353575255777717, "loss": 0.4211, "step": 517 }, { "epoch": 0.3904829210836278, "grad_norm": 6.2404046058654785, "learning_rate": 0.0001351342919456116, "loss": 0.4195, "step": 518 }, { "epoch": 0.3912367491166078, "grad_norm": 7.3141679763793945, "learning_rate": 0.0001349108583920925, "loss": 0.4473, "step": 519 }, { "epoch": 0.39199057714958774, "grad_norm": 7.678971767425537, "learning_rate": 0.00013468722618859743, "loss": 0.4102, "step": 520 }, { "epoch": 0.3927444051825677, "grad_norm": 6.773143291473389, "learning_rate": 0.0001344633966076396, "loss": 0.4518, "step": 521 }, { "epoch": 0.3934982332155477, "grad_norm": 6.161088943481445, "learning_rate": 0.00013423937092285555, "loss": 0.4, "step": 522 }, { "epoch": 0.3942520612485277, "grad_norm": 6.478328227996826, "learning_rate": 0.00013401515040899746, "loss": 0.4607, "step": 523 }, { "epoch": 0.39500588928150765, "grad_norm": 6.1380157470703125, "learning_rate": 0.00013379073634192632, "loss": 0.4108, "step": 524 }, { "epoch": 0.3957597173144876, "grad_norm": 6.8945441246032715, "learning_rate": 0.00013356612999860436, "loss": 0.4032, "step": 525 }, { "epoch": 0.39651354534746763, "grad_norm": 6.745527267456055, "learning_rate": 0.000133341332657088, "loss": 0.402, "step": 526 }, { "epoch": 0.3972673733804476, "grad_norm": 6.959543704986572, "learning_rate": 0.00013311634559652036, "loss": 0.4258, "step": 527 }, { "epoch": 0.39802120141342756, "grad_norm": 6.5237298011779785, "learning_rate": 0.00013289117009712418, "loss": 0.4042, "step": 528 }, { "epoch": 0.3987750294464075, "grad_norm": 6.997231483459473, "learning_rate": 0.00013266580744019445, "loss": 0.424, "step": 529 }, { "epoch": 0.3995288574793875, "grad_norm": 7.053787708282471, "learning_rate": 0.00013244025890809112, "loss": 0.4436, "step": 530 }, { "epoch": 0.4002826855123675, "grad_norm": 6.5921831130981445, "learning_rate": 0.00013221452578423176, "loss": 0.4262, "step": 531 }, { "epoch": 0.4010365135453475, "grad_norm": 7.524543285369873, "learning_rate": 0.00013198860935308444, "loss": 0.4205, "step": 532 }, { "epoch": 0.40179034157832744, "grad_norm": 6.691077709197998, "learning_rate": 0.00013176251090016007, "loss": 0.4303, "step": 533 }, { "epoch": 0.4025441696113074, "grad_norm": 6.8649749755859375, "learning_rate": 0.0001315362317120055, "loss": 0.4293, "step": 534 }, { "epoch": 0.4032979976442874, "grad_norm": 7.226325035095215, "learning_rate": 0.00013130977307619594, "loss": 0.4118, "step": 535 }, { "epoch": 0.4040518256772674, "grad_norm": 6.9132843017578125, "learning_rate": 0.0001310831362813276, "loss": 0.4086, "step": 536 }, { "epoch": 0.40480565371024735, "grad_norm": 6.638665199279785, "learning_rate": 0.00013085632261701063, "loss": 0.404, "step": 537 }, { "epoch": 0.4055594817432273, "grad_norm": 6.809209823608398, "learning_rate": 0.00013062933337386142, "loss": 0.378, "step": 538 }, { "epoch": 0.4063133097762073, "grad_norm": 6.697812557220459, "learning_rate": 0.00013040216984349555, "loss": 0.4068, "step": 539 }, { "epoch": 0.4070671378091873, "grad_norm": 7.231639862060547, "learning_rate": 0.00013017483331852035, "loss": 0.4167, "step": 540 }, { "epoch": 0.40782096584216726, "grad_norm": 7.607770919799805, "learning_rate": 0.00012994732509252744, "loss": 0.4298, "step": 541 }, { "epoch": 0.4085747938751472, "grad_norm": 7.685420989990234, "learning_rate": 0.00012971964646008542, "loss": 0.4435, "step": 542 }, { "epoch": 0.4093286219081272, "grad_norm": 9.00213623046875, "learning_rate": 0.00012949179871673278, "loss": 0.5072, "step": 543 }, { "epoch": 0.4100824499411072, "grad_norm": 9.699268341064453, "learning_rate": 0.00012926378315896998, "loss": 0.5158, "step": 544 }, { "epoch": 0.41083627797408717, "grad_norm": 10.096549987792969, "learning_rate": 0.00012903560108425258, "loss": 0.479, "step": 545 }, { "epoch": 0.41159010600706714, "grad_norm": 9.205822944641113, "learning_rate": 0.00012880725379098352, "loss": 0.4844, "step": 546 }, { "epoch": 0.4123439340400471, "grad_norm": 10.534090995788574, "learning_rate": 0.00012857874257850605, "loss": 0.4998, "step": 547 }, { "epoch": 0.41309776207302706, "grad_norm": 11.49348258972168, "learning_rate": 0.00012835006874709594, "loss": 0.4969, "step": 548 }, { "epoch": 0.4138515901060071, "grad_norm": 11.891164779663086, "learning_rate": 0.00012812123359795446, "loss": 0.5109, "step": 549 }, { "epoch": 0.41460541813898705, "grad_norm": 12.372316360473633, "learning_rate": 0.00012789223843320073, "loss": 0.4808, "step": 550 }, { "epoch": 0.415359246171967, "grad_norm": 9.265199661254883, "learning_rate": 0.0001276630845558644, "loss": 0.6065, "step": 551 }, { "epoch": 0.416113074204947, "grad_norm": 10.428581237792969, "learning_rate": 0.00012743377326987826, "loss": 0.5885, "step": 552 }, { "epoch": 0.416866902237927, "grad_norm": 8.8326997756958, "learning_rate": 0.00012720430588007077, "loss": 0.5599, "step": 553 }, { "epoch": 0.41762073027090696, "grad_norm": 6.87199592590332, "learning_rate": 0.00012697468369215863, "loss": 0.5212, "step": 554 }, { "epoch": 0.4183745583038869, "grad_norm": 6.59550142288208, "learning_rate": 0.00012674490801273938, "loss": 0.5265, "step": 555 }, { "epoch": 0.4191283863368669, "grad_norm": 5.809760093688965, "learning_rate": 0.00012651498014928402, "loss": 0.4861, "step": 556 }, { "epoch": 0.41988221436984685, "grad_norm": 5.872656345367432, "learning_rate": 0.00012628490141012937, "loss": 0.4476, "step": 557 }, { "epoch": 0.42063604240282687, "grad_norm": 6.835720062255859, "learning_rate": 0.000126054673104471, "loss": 0.4838, "step": 558 }, { "epoch": 0.42138987043580683, "grad_norm": 6.669496059417725, "learning_rate": 0.00012582429654235523, "loss": 0.4167, "step": 559 }, { "epoch": 0.4221436984687868, "grad_norm": 6.77216100692749, "learning_rate": 0.00012559377303467226, "loss": 0.4469, "step": 560 }, { "epoch": 0.42289752650176676, "grad_norm": 6.118035793304443, "learning_rate": 0.00012536310389314832, "loss": 0.439, "step": 561 }, { "epoch": 0.4236513545347468, "grad_norm": 6.0063886642456055, "learning_rate": 0.0001251322904303383, "loss": 0.4246, "step": 562 }, { "epoch": 0.42440518256772675, "grad_norm": 6.384454727172852, "learning_rate": 0.00012490133395961844, "loss": 0.4427, "step": 563 }, { "epoch": 0.4251590106007067, "grad_norm": 6.875798225402832, "learning_rate": 0.00012467023579517856, "loss": 0.4746, "step": 564 }, { "epoch": 0.4259128386336867, "grad_norm": 6.876395225524902, "learning_rate": 0.00012443899725201482, "loss": 0.4639, "step": 565 }, { "epoch": 0.4266666666666667, "grad_norm": 7.060841083526611, "learning_rate": 0.00012420761964592223, "loss": 0.4449, "step": 566 }, { "epoch": 0.42742049469964666, "grad_norm": 6.859095573425293, "learning_rate": 0.000123976104293487, "loss": 0.4127, "step": 567 }, { "epoch": 0.4281743227326266, "grad_norm": 6.3295135498046875, "learning_rate": 0.00012374445251207914, "loss": 0.4436, "step": 568 }, { "epoch": 0.4289281507656066, "grad_norm": 6.203479766845703, "learning_rate": 0.00012351266561984507, "loss": 0.4493, "step": 569 }, { "epoch": 0.42968197879858655, "grad_norm": 6.393275737762451, "learning_rate": 0.00012328074493569993, "loss": 0.451, "step": 570 }, { "epoch": 0.43043580683156657, "grad_norm": 6.78492546081543, "learning_rate": 0.0001230486917793202, "loss": 0.4278, "step": 571 }, { "epoch": 0.43118963486454653, "grad_norm": 6.327200889587402, "learning_rate": 0.00012281650747113612, "loss": 0.4422, "step": 572 }, { "epoch": 0.4319434628975265, "grad_norm": 6.7098822593688965, "learning_rate": 0.0001225841933323242, "loss": 0.4556, "step": 573 }, { "epoch": 0.43269729093050646, "grad_norm": 6.249898910522461, "learning_rate": 0.00012235175068479984, "loss": 0.4184, "step": 574 }, { "epoch": 0.4334511189634865, "grad_norm": 6.380219459533691, "learning_rate": 0.00012211918085120954, "loss": 0.437, "step": 575 }, { "epoch": 0.43420494699646645, "grad_norm": 6.367920875549316, "learning_rate": 0.00012188648515492355, "loss": 0.4269, "step": 576 }, { "epoch": 0.4349587750294464, "grad_norm": 6.438598155975342, "learning_rate": 0.00012165366492002832, "loss": 0.4298, "step": 577 }, { "epoch": 0.4357126030624264, "grad_norm": 6.798791408538818, "learning_rate": 0.00012142072147131898, "loss": 0.4204, "step": 578 }, { "epoch": 0.43646643109540634, "grad_norm": 6.528103828430176, "learning_rate": 0.00012118765613429173, "loss": 0.4448, "step": 579 }, { "epoch": 0.43722025912838636, "grad_norm": 6.5673909187316895, "learning_rate": 0.0001209544702351363, "loss": 0.432, "step": 580 }, { "epoch": 0.4379740871613663, "grad_norm": 7.303831577301025, "learning_rate": 0.00012072116510072858, "loss": 0.4125, "step": 581 }, { "epoch": 0.4387279151943463, "grad_norm": 6.5421576499938965, "learning_rate": 0.00012048774205862279, "loss": 0.4171, "step": 582 }, { "epoch": 0.43948174322732625, "grad_norm": 6.537741661071777, "learning_rate": 0.0001202542024370441, "loss": 0.385, "step": 583 }, { "epoch": 0.44023557126030627, "grad_norm": 6.6051530838012695, "learning_rate": 0.00012002054756488115, "loss": 0.3888, "step": 584 }, { "epoch": 0.44098939929328623, "grad_norm": 6.796999454498291, "learning_rate": 0.00011978677877167822, "loss": 0.4049, "step": 585 }, { "epoch": 0.4417432273262662, "grad_norm": 7.154036521911621, "learning_rate": 0.00011955289738762796, "loss": 0.4168, "step": 586 }, { "epoch": 0.44249705535924616, "grad_norm": 6.852260112762451, "learning_rate": 0.00011931890474356358, "loss": 0.381, "step": 587 }, { "epoch": 0.4432508833922261, "grad_norm": 6.91892671585083, "learning_rate": 0.00011908480217095141, "loss": 0.3895, "step": 588 }, { "epoch": 0.44400471142520614, "grad_norm": 7.690057277679443, "learning_rate": 0.00011885059100188341, "loss": 0.4504, "step": 589 }, { "epoch": 0.4447585394581861, "grad_norm": 7.000772476196289, "learning_rate": 0.00011861627256906929, "loss": 0.3868, "step": 590 }, { "epoch": 0.4455123674911661, "grad_norm": 7.221988201141357, "learning_rate": 0.00011838184820582923, "loss": 0.4119, "step": 591 }, { "epoch": 0.44626619552414604, "grad_norm": 8.583606719970703, "learning_rate": 0.00011814731924608616, "loss": 0.4087, "step": 592 }, { "epoch": 0.44702002355712606, "grad_norm": 8.559534072875977, "learning_rate": 0.00011791268702435816, "loss": 0.4469, "step": 593 }, { "epoch": 0.447773851590106, "grad_norm": 8.477254867553711, "learning_rate": 0.0001176779528757509, "loss": 0.476, "step": 594 }, { "epoch": 0.448527679623086, "grad_norm": 9.82533073425293, "learning_rate": 0.00011744311813595006, "loss": 0.5395, "step": 595 }, { "epoch": 0.44928150765606595, "grad_norm": 9.407917022705078, "learning_rate": 0.00011720818414121368, "loss": 0.4716, "step": 596 }, { "epoch": 0.4500353356890459, "grad_norm": 11.39129638671875, "learning_rate": 0.00011697315222836458, "loss": 0.4827, "step": 597 }, { "epoch": 0.45078916372202593, "grad_norm": 11.540337562561035, "learning_rate": 0.0001167380237347828, "loss": 0.4713, "step": 598 }, { "epoch": 0.4515429917550059, "grad_norm": 10.345648765563965, "learning_rate": 0.00011650279999839787, "loss": 0.4148, "step": 599 }, { "epoch": 0.45229681978798586, "grad_norm": 12.826940536499023, "learning_rate": 0.00011626748235768128, "loss": 0.487, "step": 600 }, { "epoch": 0.4530506478209658, "grad_norm": 9.553250312805176, "learning_rate": 0.00011603207215163894, "loss": 0.5809, "step": 601 }, { "epoch": 0.45380447585394584, "grad_norm": 9.77419662475586, "learning_rate": 0.0001157965707198034, "loss": 0.5538, "step": 602 }, { "epoch": 0.4545583038869258, "grad_norm": 8.743382453918457, "learning_rate": 0.00011556097940222628, "loss": 0.5516, "step": 603 }, { "epoch": 0.45531213191990577, "grad_norm": 7.538958549499512, "learning_rate": 0.00011532529953947075, "loss": 0.5119, "step": 604 }, { "epoch": 0.45606595995288574, "grad_norm": 6.539525032043457, "learning_rate": 0.00011508953247260379, "loss": 0.499, "step": 605 }, { "epoch": 0.4568197879858657, "grad_norm": 6.682277679443359, "learning_rate": 0.00011485367954318856, "loss": 0.4594, "step": 606 }, { "epoch": 0.4575736160188457, "grad_norm": 5.594506740570068, "learning_rate": 0.0001146177420932768, "loss": 0.4609, "step": 607 }, { "epoch": 0.4583274440518257, "grad_norm": 6.195127964019775, "learning_rate": 0.00011438172146540123, "loss": 0.4413, "step": 608 }, { "epoch": 0.45908127208480565, "grad_norm": 6.665927410125732, "learning_rate": 0.00011414561900256784, "loss": 0.4492, "step": 609 }, { "epoch": 0.4598351001177856, "grad_norm": 7.045360088348389, "learning_rate": 0.00011390943604824826, "loss": 0.4508, "step": 610 }, { "epoch": 0.46058892815076563, "grad_norm": 7.470615386962891, "learning_rate": 0.00011367317394637218, "loss": 0.46, "step": 611 }, { "epoch": 0.4613427561837456, "grad_norm": 6.948364734649658, "learning_rate": 0.00011343683404131964, "loss": 0.477, "step": 612 }, { "epoch": 0.46209658421672556, "grad_norm": 6.797374248504639, "learning_rate": 0.00011320041767791336, "loss": 0.4726, "step": 613 }, { "epoch": 0.4628504122497055, "grad_norm": 6.488336563110352, "learning_rate": 0.00011296392620141114, "loss": 0.4403, "step": 614 }, { "epoch": 0.4636042402826855, "grad_norm": 7.050676345825195, "learning_rate": 0.00011272736095749823, "loss": 0.475, "step": 615 }, { "epoch": 0.4643580683156655, "grad_norm": 6.4435038566589355, "learning_rate": 0.00011249072329227959, "loss": 0.4188, "step": 616 }, { "epoch": 0.46511189634864547, "grad_norm": 6.662125110626221, "learning_rate": 0.0001122540145522723, "loss": 0.4365, "step": 617 }, { "epoch": 0.46586572438162543, "grad_norm": 6.387564659118652, "learning_rate": 0.00011201723608439778, "loss": 0.4237, "step": 618 }, { "epoch": 0.4666195524146054, "grad_norm": 6.151999473571777, "learning_rate": 0.0001117803892359744, "loss": 0.3967, "step": 619 }, { "epoch": 0.4673733804475854, "grad_norm": 6.0764055252075195, "learning_rate": 0.00011154347535470947, "loss": 0.4032, "step": 620 }, { "epoch": 0.4681272084805654, "grad_norm": 6.069274425506592, "learning_rate": 0.00011130649578869173, "loss": 0.4234, "step": 621 }, { "epoch": 0.46888103651354535, "grad_norm": 6.283833980560303, "learning_rate": 0.00011106945188638378, "loss": 0.4115, "step": 622 }, { "epoch": 0.4696348645465253, "grad_norm": 6.327964782714844, "learning_rate": 0.00011083234499661426, "loss": 0.4293, "step": 623 }, { "epoch": 0.4703886925795053, "grad_norm": 6.516750812530518, "learning_rate": 0.00011059517646857023, "loss": 0.3893, "step": 624 }, { "epoch": 0.4711425206124853, "grad_norm": 7.370739936828613, "learning_rate": 0.00011035794765178941, "loss": 0.4385, "step": 625 }, { "epoch": 0.47189634864546526, "grad_norm": 7.1700568199157715, "learning_rate": 0.0001101206598961527, "loss": 0.4221, "step": 626 }, { "epoch": 0.4726501766784452, "grad_norm": 6.261050701141357, "learning_rate": 0.00010988331455187628, "loss": 0.4389, "step": 627 }, { "epoch": 0.4734040047114252, "grad_norm": 6.810924530029297, "learning_rate": 0.00010964591296950406, "loss": 0.4653, "step": 628 }, { "epoch": 0.4741578327444052, "grad_norm": 6.419404983520508, "learning_rate": 0.00010940845649989994, "loss": 0.4074, "step": 629 }, { "epoch": 0.47491166077738517, "grad_norm": 6.0266008377075195, "learning_rate": 0.00010917094649424018, "loss": 0.3729, "step": 630 }, { "epoch": 0.47566548881036513, "grad_norm": 6.674122333526611, "learning_rate": 0.00010893338430400562, "loss": 0.4016, "step": 631 }, { "epoch": 0.4764193168433451, "grad_norm": 6.93697452545166, "learning_rate": 0.00010869577128097404, "loss": 0.3884, "step": 632 }, { "epoch": 0.47717314487632506, "grad_norm": 6.370805263519287, "learning_rate": 0.00010845810877721252, "loss": 0.3835, "step": 633 }, { "epoch": 0.4779269729093051, "grad_norm": 6.402405738830566, "learning_rate": 0.00010822039814506964, "loss": 0.396, "step": 634 }, { "epoch": 0.47868080094228505, "grad_norm": 6.631165027618408, "learning_rate": 0.00010798264073716791, "loss": 0.4034, "step": 635 }, { "epoch": 0.479434628975265, "grad_norm": 7.069218635559082, "learning_rate": 0.00010774483790639591, "loss": 0.4071, "step": 636 }, { "epoch": 0.480188457008245, "grad_norm": 6.614718914031982, "learning_rate": 0.00010750699100590076, "loss": 0.3959, "step": 637 }, { "epoch": 0.480942285041225, "grad_norm": 6.693352699279785, "learning_rate": 0.00010726910138908032, "loss": 0.3853, "step": 638 }, { "epoch": 0.48169611307420496, "grad_norm": 6.8856940269470215, "learning_rate": 0.00010703117040957553, "loss": 0.3904, "step": 639 }, { "epoch": 0.4824499411071849, "grad_norm": 7.3366522789001465, "learning_rate": 0.00010679319942126264, "loss": 0.4061, "step": 640 }, { "epoch": 0.4832037691401649, "grad_norm": 7.205180644989014, "learning_rate": 0.00010655518977824566, "loss": 0.4066, "step": 641 }, { "epoch": 0.48395759717314485, "grad_norm": 9.314166069030762, "learning_rate": 0.00010631714283484842, "loss": 0.4507, "step": 642 }, { "epoch": 0.48471142520612487, "grad_norm": 8.445844650268555, "learning_rate": 0.0001060790599456071, "loss": 0.4467, "step": 643 }, { "epoch": 0.48546525323910483, "grad_norm": 8.920785903930664, "learning_rate": 0.00010584094246526237, "loss": 0.4593, "step": 644 }, { "epoch": 0.4862190812720848, "grad_norm": 9.759257316589355, "learning_rate": 0.00010560279174875179, "loss": 0.5054, "step": 645 }, { "epoch": 0.48697290930506476, "grad_norm": 9.649422645568848, "learning_rate": 0.0001053646091512019, "loss": 0.4891, "step": 646 }, { "epoch": 0.4877267373380448, "grad_norm": 9.831908226013184, "learning_rate": 0.00010512639602792088, "loss": 0.4805, "step": 647 }, { "epoch": 0.48848056537102474, "grad_norm": 11.026556968688965, "learning_rate": 0.00010488815373439036, "loss": 0.4875, "step": 648 }, { "epoch": 0.4892343934040047, "grad_norm": 10.98789119720459, "learning_rate": 0.00010464988362625812, "loss": 0.4852, "step": 649 }, { "epoch": 0.48998822143698467, "grad_norm": 12.804154396057129, "learning_rate": 0.00010441158705933016, "loss": 0.5069, "step": 650 }, { "epoch": 0.4907420494699647, "grad_norm": 7.31414270401001, "learning_rate": 0.00010417326538956305, "loss": 0.5666, "step": 651 }, { "epoch": 0.49149587750294466, "grad_norm": 7.537758827209473, "learning_rate": 0.00010393491997305613, "loss": 0.5592, "step": 652 }, { "epoch": 0.4922497055359246, "grad_norm": 7.580841064453125, "learning_rate": 0.00010369655216604397, "loss": 0.4984, "step": 653 }, { "epoch": 0.4930035335689046, "grad_norm": 7.048511028289795, "learning_rate": 0.0001034581633248885, "loss": 0.5271, "step": 654 }, { "epoch": 0.49375736160188455, "grad_norm": 6.32865047454834, "learning_rate": 0.00010321975480607129, "loss": 0.4999, "step": 655 }, { "epoch": 0.49451118963486457, "grad_norm": 5.981396675109863, "learning_rate": 0.00010298132796618596, "loss": 0.4717, "step": 656 }, { "epoch": 0.49526501766784453, "grad_norm": 5.971866130828857, "learning_rate": 0.00010274288416193034, "loss": 0.4357, "step": 657 }, { "epoch": 0.4960188457008245, "grad_norm": 5.870616912841797, "learning_rate": 0.0001025044247500988, "loss": 0.4475, "step": 658 }, { "epoch": 0.49677267373380446, "grad_norm": 6.04547119140625, "learning_rate": 0.00010226595108757451, "loss": 0.4641, "step": 659 }, { "epoch": 0.4975265017667845, "grad_norm": 6.311388969421387, "learning_rate": 0.00010202746453132172, "loss": 0.4697, "step": 660 }, { "epoch": 0.49828032979976444, "grad_norm": 5.957773208618164, "learning_rate": 0.00010178896643837809, "loss": 0.4381, "step": 661 }, { "epoch": 0.4990341578327444, "grad_norm": 6.014715671539307, "learning_rate": 0.00010155045816584691, "loss": 0.4429, "step": 662 }, { "epoch": 0.49978798586572437, "grad_norm": 5.99500846862793, "learning_rate": 0.00010131194107088935, "loss": 0.4544, "step": 663 }, { "epoch": 0.5005418138987043, "grad_norm": 6.102397918701172, "learning_rate": 0.00010107341651071684, "loss": 0.4437, "step": 664 }, { "epoch": 0.5005418138987043, "eval_loss": 0.44807884097099304, "eval_runtime": 126.4853, "eval_samples_per_second": 17.67, "eval_steps_per_second": 8.839, "step": 664 }, { "epoch": 0.5012956419316843, "grad_norm": 5.838627338409424, "learning_rate": 0.00010083488584258326, "loss": 0.3961, "step": 665 }, { "epoch": 0.5020494699646643, "grad_norm": 6.225624084472656, "learning_rate": 0.00010059635042377725, "loss": 0.4199, "step": 666 }, { "epoch": 0.5028032979976443, "grad_norm": 5.906275749206543, "learning_rate": 0.00010035781161161446, "loss": 0.4164, "step": 667 }, { "epoch": 0.5035571260306243, "grad_norm": 5.818455696105957, "learning_rate": 0.0001001192707634299, "loss": 0.3753, "step": 668 }, { "epoch": 0.5043109540636043, "grad_norm": 6.505937099456787, "learning_rate": 9.988072923657012e-05, "loss": 0.4058, "step": 669 }, { "epoch": 0.5050647820965842, "grad_norm": 6.205794811248779, "learning_rate": 9.964218838838554e-05, "loss": 0.4176, "step": 670 }, { "epoch": 0.5058186101295642, "grad_norm": 6.019129753112793, "learning_rate": 9.940364957622276e-05, "loss": 0.4253, "step": 671 }, { "epoch": 0.5065724381625442, "grad_norm": 5.988311290740967, "learning_rate": 9.916511415741676e-05, "loss": 0.399, "step": 672 }, { "epoch": 0.5073262661955241, "grad_norm": 6.607666492462158, "learning_rate": 9.892658348928316e-05, "loss": 0.4154, "step": 673 }, { "epoch": 0.5080800942285041, "grad_norm": 5.99027156829834, "learning_rate": 9.868805892911067e-05, "loss": 0.387, "step": 674 }, { "epoch": 0.508833922261484, "grad_norm": 6.09193229675293, "learning_rate": 9.84495418341531e-05, "loss": 0.3817, "step": 675 }, { "epoch": 0.5095877502944641, "grad_norm": 6.635573863983154, "learning_rate": 9.821103356162189e-05, "loss": 0.4021, "step": 676 }, { "epoch": 0.5103415783274441, "grad_norm": 6.2010884284973145, "learning_rate": 9.797253546867831e-05, "loss": 0.3915, "step": 677 }, { "epoch": 0.511095406360424, "grad_norm": 6.824472427368164, "learning_rate": 9.773404891242551e-05, "loss": 0.3946, "step": 678 }, { "epoch": 0.511849234393404, "grad_norm": 7.179849147796631, "learning_rate": 9.749557524990121e-05, "loss": 0.4281, "step": 679 }, { "epoch": 0.512603062426384, "grad_norm": 6.765272617340088, "learning_rate": 9.72571158380697e-05, "loss": 0.4113, "step": 680 }, { "epoch": 0.513356890459364, "grad_norm": 6.409517765045166, "learning_rate": 9.701867203381405e-05, "loss": 0.387, "step": 681 }, { "epoch": 0.5141107184923439, "grad_norm": 6.494263172149658, "learning_rate": 9.678024519392871e-05, "loss": 0.3783, "step": 682 }, { "epoch": 0.5148645465253239, "grad_norm": 6.259777545928955, "learning_rate": 9.654183667511154e-05, "loss": 0.3996, "step": 683 }, { "epoch": 0.5156183745583038, "grad_norm": 6.5478363037109375, "learning_rate": 9.630344783395604e-05, "loss": 0.3838, "step": 684 }, { "epoch": 0.5163722025912839, "grad_norm": 7.6854071617126465, "learning_rate": 9.606508002694386e-05, "loss": 0.4235, "step": 685 }, { "epoch": 0.5171260306242639, "grad_norm": 7.029118537902832, "learning_rate": 9.5826734610437e-05, "loss": 0.418, "step": 686 }, { "epoch": 0.5178798586572438, "grad_norm": 7.062952518463135, "learning_rate": 9.558841294066985e-05, "loss": 0.4281, "step": 687 }, { "epoch": 0.5186336866902238, "grad_norm": 6.547257900238037, "learning_rate": 9.535011637374189e-05, "loss": 0.4008, "step": 688 }, { "epoch": 0.5193875147232038, "grad_norm": 7.128522872924805, "learning_rate": 9.511184626560968e-05, "loss": 0.4072, "step": 689 }, { "epoch": 0.5201413427561837, "grad_norm": 6.604221343994141, "learning_rate": 9.487360397207916e-05, "loss": 0.3906, "step": 690 }, { "epoch": 0.5208951707891637, "grad_norm": 7.471280574798584, "learning_rate": 9.463539084879809e-05, "loss": 0.4373, "step": 691 }, { "epoch": 0.5216489988221437, "grad_norm": 7.444307804107666, "learning_rate": 9.439720825124827e-05, "loss": 0.4245, "step": 692 }, { "epoch": 0.5224028268551236, "grad_norm": 7.748506546020508, "learning_rate": 9.415905753473765e-05, "loss": 0.4267, "step": 693 }, { "epoch": 0.5231566548881037, "grad_norm": 8.47761344909668, "learning_rate": 9.392094005439291e-05, "loss": 0.4861, "step": 694 }, { "epoch": 0.5239104829210837, "grad_norm": 9.239935874938965, "learning_rate": 9.368285716515162e-05, "loss": 0.45, "step": 695 }, { "epoch": 0.5246643109540636, "grad_norm": 9.59188461303711, "learning_rate": 9.344481022175436e-05, "loss": 0.4876, "step": 696 }, { "epoch": 0.5254181389870436, "grad_norm": 10.498910903930664, "learning_rate": 9.320680057873735e-05, "loss": 0.5021, "step": 697 }, { "epoch": 0.5261719670200236, "grad_norm": 11.162120819091797, "learning_rate": 9.29688295904245e-05, "loss": 0.5001, "step": 698 }, { "epoch": 0.5269257950530035, "grad_norm": 11.781893730163574, "learning_rate": 9.273089861091969e-05, "loss": 0.456, "step": 699 }, { "epoch": 0.5276796230859835, "grad_norm": 15.090996742248535, "learning_rate": 9.249300899409924e-05, "loss": 0.5593, "step": 700 }, { "epoch": 0.5284334511189634, "grad_norm": 9.527992248535156, "learning_rate": 9.225516209360413e-05, "loss": 0.5803, "step": 701 }, { "epoch": 0.5291872791519434, "grad_norm": 8.856983184814453, "learning_rate": 9.201735926283213e-05, "loss": 0.5268, "step": 702 }, { "epoch": 0.5299411071849235, "grad_norm": 7.78725528717041, "learning_rate": 9.177960185493036e-05, "loss": 0.5227, "step": 703 }, { "epoch": 0.5306949352179035, "grad_norm": 7.152993679046631, "learning_rate": 9.154189122278754e-05, "loss": 0.5067, "step": 704 }, { "epoch": 0.5314487632508834, "grad_norm": 6.18569278717041, "learning_rate": 9.1304228719026e-05, "loss": 0.476, "step": 705 }, { "epoch": 0.5322025912838634, "grad_norm": 6.376234531402588, "learning_rate": 9.106661569599442e-05, "loss": 0.4734, "step": 706 }, { "epoch": 0.5329564193168433, "grad_norm": 6.275115489959717, "learning_rate": 9.082905350575986e-05, "loss": 0.4468, "step": 707 }, { "epoch": 0.5337102473498233, "grad_norm": 5.899405479431152, "learning_rate": 9.059154350010008e-05, "loss": 0.4738, "step": 708 }, { "epoch": 0.5344640753828033, "grad_norm": 6.213337421417236, "learning_rate": 9.035408703049596e-05, "loss": 0.4732, "step": 709 }, { "epoch": 0.5352179034157832, "grad_norm": 6.043967247009277, "learning_rate": 9.011668544812377e-05, "loss": 0.4514, "step": 710 }, { "epoch": 0.5359717314487632, "grad_norm": 6.495950698852539, "learning_rate": 8.987934010384733e-05, "loss": 0.4468, "step": 711 }, { "epoch": 0.5367255594817433, "grad_norm": 6.062058448791504, "learning_rate": 8.96420523482106e-05, "loss": 0.4311, "step": 712 }, { "epoch": 0.5374793875147232, "grad_norm": 6.561244964599609, "learning_rate": 8.940482353142983e-05, "loss": 0.4621, "step": 713 }, { "epoch": 0.5382332155477032, "grad_norm": 5.8635029792785645, "learning_rate": 8.916765500338575e-05, "loss": 0.4189, "step": 714 }, { "epoch": 0.5389870435806832, "grad_norm": 6.959576606750488, "learning_rate": 8.893054811361624e-05, "loss": 0.4382, "step": 715 }, { "epoch": 0.5397408716136631, "grad_norm": 5.93906307220459, "learning_rate": 8.869350421130831e-05, "loss": 0.4202, "step": 716 }, { "epoch": 0.5404946996466431, "grad_norm": 5.888154029846191, "learning_rate": 8.845652464529057e-05, "loss": 0.4098, "step": 717 }, { "epoch": 0.5412485276796231, "grad_norm": 6.113773345947266, "learning_rate": 8.821961076402563e-05, "loss": 0.412, "step": 718 }, { "epoch": 0.542002355712603, "grad_norm": 6.2954607009887695, "learning_rate": 8.79827639156022e-05, "loss": 0.4472, "step": 719 }, { "epoch": 0.542756183745583, "grad_norm": 6.085266590118408, "learning_rate": 8.774598544772774e-05, "loss": 0.4134, "step": 720 }, { "epoch": 0.5435100117785631, "grad_norm": 5.995761871337891, "learning_rate": 8.750927670772044e-05, "loss": 0.4236, "step": 721 }, { "epoch": 0.544263839811543, "grad_norm": 6.094368934631348, "learning_rate": 8.727263904250178e-05, "loss": 0.4344, "step": 722 }, { "epoch": 0.545017667844523, "grad_norm": 6.14577579498291, "learning_rate": 8.703607379858889e-05, "loss": 0.396, "step": 723 }, { "epoch": 0.545771495877503, "grad_norm": 5.814198970794678, "learning_rate": 8.679958232208668e-05, "loss": 0.3987, "step": 724 }, { "epoch": 0.5465253239104829, "grad_norm": 6.348716735839844, "learning_rate": 8.656316595868037e-05, "loss": 0.4263, "step": 725 }, { "epoch": 0.5472791519434629, "grad_norm": 6.51011323928833, "learning_rate": 8.632682605362784e-05, "loss": 0.4361, "step": 726 }, { "epoch": 0.5480329799764428, "grad_norm": 6.134734630584717, "learning_rate": 8.609056395175175e-05, "loss": 0.3946, "step": 727 }, { "epoch": 0.5487868080094228, "grad_norm": 6.129810333251953, "learning_rate": 8.585438099743217e-05, "loss": 0.3948, "step": 728 }, { "epoch": 0.5495406360424028, "grad_norm": 6.51365852355957, "learning_rate": 8.56182785345988e-05, "loss": 0.4182, "step": 729 }, { "epoch": 0.5502944640753828, "grad_norm": 6.257938861846924, "learning_rate": 8.538225790672322e-05, "loss": 0.4041, "step": 730 }, { "epoch": 0.5510482921083628, "grad_norm": 6.626195430755615, "learning_rate": 8.514632045681145e-05, "loss": 0.4291, "step": 731 }, { "epoch": 0.5518021201413428, "grad_norm": 6.350541591644287, "learning_rate": 8.491046752739624e-05, "loss": 0.4113, "step": 732 }, { "epoch": 0.5525559481743227, "grad_norm": 6.342377185821533, "learning_rate": 8.467470046052927e-05, "loss": 0.3725, "step": 733 }, { "epoch": 0.5533097762073027, "grad_norm": 6.338717460632324, "learning_rate": 8.443902059777373e-05, "loss": 0.4044, "step": 734 }, { "epoch": 0.5540636042402827, "grad_norm": 6.489543914794922, "learning_rate": 8.420342928019666e-05, "loss": 0.3806, "step": 735 }, { "epoch": 0.5548174322732626, "grad_norm": 6.675236701965332, "learning_rate": 8.396792784836108e-05, "loss": 0.3937, "step": 736 }, { "epoch": 0.5555712603062426, "grad_norm": 7.242746829986572, "learning_rate": 8.373251764231872e-05, "loss": 0.3968, "step": 737 }, { "epoch": 0.5563250883392226, "grad_norm": 6.987369537353516, "learning_rate": 8.349720000160218e-05, "loss": 0.3878, "step": 738 }, { "epoch": 0.5570789163722026, "grad_norm": 7.393560886383057, "learning_rate": 8.326197626521723e-05, "loss": 0.3883, "step": 739 }, { "epoch": 0.5578327444051826, "grad_norm": 7.474055290222168, "learning_rate": 8.30268477716354e-05, "loss": 0.4183, "step": 740 }, { "epoch": 0.5585865724381626, "grad_norm": 7.556806564331055, "learning_rate": 8.279181585878635e-05, "loss": 0.4282, "step": 741 }, { "epoch": 0.5593404004711425, "grad_norm": 8.794517517089844, "learning_rate": 8.255688186404996e-05, "loss": 0.4694, "step": 742 }, { "epoch": 0.5600942285041225, "grad_norm": 9.162858963012695, "learning_rate": 8.232204712424911e-05, "loss": 0.4888, "step": 743 }, { "epoch": 0.5608480565371025, "grad_norm": 9.154852867126465, "learning_rate": 8.208731297564189e-05, "loss": 0.4735, "step": 744 }, { "epoch": 0.5616018845700824, "grad_norm": 9.025120735168457, "learning_rate": 8.185268075391388e-05, "loss": 0.4743, "step": 745 }, { "epoch": 0.5623557126030624, "grad_norm": 9.328535079956055, "learning_rate": 8.161815179417078e-05, "loss": 0.4575, "step": 746 }, { "epoch": 0.5631095406360423, "grad_norm": 9.941339492797852, "learning_rate": 8.138372743093076e-05, "loss": 0.4969, "step": 747 }, { "epoch": 0.5638633686690224, "grad_norm": 9.928484916687012, "learning_rate": 8.114940899811662e-05, "loss": 0.4634, "step": 748 }, { "epoch": 0.5646171967020024, "grad_norm": 10.29101848602295, "learning_rate": 8.091519782904857e-05, "loss": 0.4114, "step": 749 }, { "epoch": 0.5653710247349824, "grad_norm": 15.212136268615723, "learning_rate": 8.068109525643647e-05, "loss": 0.516, "step": 750 }, { "epoch": 0.5661248527679623, "grad_norm": 8.223611831665039, "learning_rate": 8.044710261237207e-05, "loss": 0.541, "step": 751 }, { "epoch": 0.5668786808009423, "grad_norm": 8.392924308776855, "learning_rate": 8.021322122832178e-05, "loss": 0.5317, "step": 752 }, { "epoch": 0.5676325088339222, "grad_norm": 8.130448341369629, "learning_rate": 7.99794524351189e-05, "loss": 0.4935, "step": 753 }, { "epoch": 0.5683863368669022, "grad_norm": 6.9753899574279785, "learning_rate": 7.974579756295591e-05, "loss": 0.4941, "step": 754 }, { "epoch": 0.5691401648998822, "grad_norm": 6.365013122558594, "learning_rate": 7.951225794137724e-05, "loss": 0.4539, "step": 755 }, { "epoch": 0.5698939929328621, "grad_norm": 5.7341628074646, "learning_rate": 7.927883489927147e-05, "loss": 0.4197, "step": 756 }, { "epoch": 0.5706478209658422, "grad_norm": 6.036746025085449, "learning_rate": 7.904552976486372e-05, "loss": 0.4361, "step": 757 }, { "epoch": 0.5714016489988222, "grad_norm": 5.587414264678955, "learning_rate": 7.88123438657083e-05, "loss": 0.4294, "step": 758 }, { "epoch": 0.5721554770318021, "grad_norm": 5.824455738067627, "learning_rate": 7.857927852868107e-05, "loss": 0.426, "step": 759 }, { "epoch": 0.5729093050647821, "grad_norm": 5.811740398406982, "learning_rate": 7.83463350799717e-05, "loss": 0.4336, "step": 760 }, { "epoch": 0.5736631330977621, "grad_norm": 5.9260945320129395, "learning_rate": 7.811351484507647e-05, "loss": 0.4609, "step": 761 }, { "epoch": 0.574416961130742, "grad_norm": 6.589666843414307, "learning_rate": 7.788081914879051e-05, "loss": 0.4384, "step": 762 }, { "epoch": 0.575170789163722, "grad_norm": 5.957409858703613, "learning_rate": 7.764824931520018e-05, "loss": 0.4261, "step": 763 }, { "epoch": 0.575924617196702, "grad_norm": 6.138071060180664, "learning_rate": 7.741580666767583e-05, "loss": 0.4189, "step": 764 }, { "epoch": 0.5766784452296819, "grad_norm": 5.744472503662109, "learning_rate": 7.718349252886395e-05, "loss": 0.4086, "step": 765 }, { "epoch": 0.577432273262662, "grad_norm": 6.045204162597656, "learning_rate": 7.695130822067984e-05, "loss": 0.4306, "step": 766 }, { "epoch": 0.578186101295642, "grad_norm": 5.609772682189941, "learning_rate": 7.67192550643001e-05, "loss": 0.3998, "step": 767 }, { "epoch": 0.5789399293286219, "grad_norm": 5.921622276306152, "learning_rate": 7.648733438015493e-05, "loss": 0.4225, "step": 768 }, { "epoch": 0.5796937573616019, "grad_norm": 6.352652072906494, "learning_rate": 7.625554748792085e-05, "loss": 0.4193, "step": 769 }, { "epoch": 0.5804475853945819, "grad_norm": 6.210894584655762, "learning_rate": 7.602389570651303e-05, "loss": 0.4119, "step": 770 }, { "epoch": 0.5812014134275618, "grad_norm": 6.061959743499756, "learning_rate": 7.579238035407776e-05, "loss": 0.4097, "step": 771 }, { "epoch": 0.5819552414605418, "grad_norm": 6.42627477645874, "learning_rate": 7.556100274798519e-05, "loss": 0.4226, "step": 772 }, { "epoch": 0.5827090694935217, "grad_norm": 6.124332904815674, "learning_rate": 7.532976420482146e-05, "loss": 0.396, "step": 773 }, { "epoch": 0.5834628975265017, "grad_norm": 5.928023815155029, "learning_rate": 7.509866604038157e-05, "loss": 0.3897, "step": 774 }, { "epoch": 0.5842167255594818, "grad_norm": 6.037590503692627, "learning_rate": 7.486770956966171e-05, "loss": 0.3958, "step": 775 }, { "epoch": 0.5849705535924618, "grad_norm": 6.051185131072998, "learning_rate": 7.463689610685171e-05, "loss": 0.4072, "step": 776 }, { "epoch": 0.5857243816254417, "grad_norm": 6.234012126922607, "learning_rate": 7.440622696532775e-05, "loss": 0.4151, "step": 777 }, { "epoch": 0.5864782096584217, "grad_norm": 6.273362636566162, "learning_rate": 7.417570345764481e-05, "loss": 0.418, "step": 778 }, { "epoch": 0.5872320376914016, "grad_norm": 6.810718059539795, "learning_rate": 7.394532689552905e-05, "loss": 0.4082, "step": 779 }, { "epoch": 0.5879858657243816, "grad_norm": 7.068334102630615, "learning_rate": 7.371509858987061e-05, "loss": 0.4031, "step": 780 }, { "epoch": 0.5887396937573616, "grad_norm": 6.441345691680908, "learning_rate": 7.348501985071603e-05, "loss": 0.3973, "step": 781 }, { "epoch": 0.5894935217903415, "grad_norm": 6.285884380340576, "learning_rate": 7.325509198726064e-05, "loss": 0.3888, "step": 782 }, { "epoch": 0.5902473498233216, "grad_norm": 5.942330360412598, "learning_rate": 7.302531630784137e-05, "loss": 0.3656, "step": 783 }, { "epoch": 0.5910011778563016, "grad_norm": 6.333634376525879, "learning_rate": 7.279569411992926e-05, "loss": 0.4081, "step": 784 }, { "epoch": 0.5917550058892815, "grad_norm": 6.436288833618164, "learning_rate": 7.256622673012175e-05, "loss": 0.4118, "step": 785 }, { "epoch": 0.5925088339222615, "grad_norm": 6.464933395385742, "learning_rate": 7.233691544413558e-05, "loss": 0.4269, "step": 786 }, { "epoch": 0.5932626619552415, "grad_norm": 6.593018054962158, "learning_rate": 7.210776156679931e-05, "loss": 0.4124, "step": 787 }, { "epoch": 0.5940164899882214, "grad_norm": 6.8628363609313965, "learning_rate": 7.187876640204556e-05, "loss": 0.4109, "step": 788 }, { "epoch": 0.5947703180212014, "grad_norm": 7.0224151611328125, "learning_rate": 7.164993125290407e-05, "loss": 0.4141, "step": 789 }, { "epoch": 0.5955241460541814, "grad_norm": 6.763969421386719, "learning_rate": 7.1421257421494e-05, "loss": 0.4093, "step": 790 }, { "epoch": 0.5962779740871613, "grad_norm": 7.6155781745910645, "learning_rate": 7.119274620901649e-05, "loss": 0.413, "step": 791 }, { "epoch": 0.5970318021201414, "grad_norm": 7.919892311096191, "learning_rate": 7.096439891574745e-05, "loss": 0.422, "step": 792 }, { "epoch": 0.5977856301531214, "grad_norm": 9.18865966796875, "learning_rate": 7.073621684103007e-05, "loss": 0.4679, "step": 793 }, { "epoch": 0.5985394581861013, "grad_norm": 8.299490928649902, "learning_rate": 7.050820128326724e-05, "loss": 0.4638, "step": 794 }, { "epoch": 0.5992932862190813, "grad_norm": 9.120932579040527, "learning_rate": 7.028035353991456e-05, "loss": 0.451, "step": 795 }, { "epoch": 0.6000471142520613, "grad_norm": 9.830779075622559, "learning_rate": 7.005267490747263e-05, "loss": 0.4778, "step": 796 }, { "epoch": 0.6008009422850412, "grad_norm": 10.880460739135742, "learning_rate": 6.982516668147967e-05, "loss": 0.4404, "step": 797 }, { "epoch": 0.6015547703180212, "grad_norm": 10.648106575012207, "learning_rate": 6.959783015650446e-05, "loss": 0.5199, "step": 798 }, { "epoch": 0.6023085983510011, "grad_norm": 11.122642517089844, "learning_rate": 6.937066662613863e-05, "loss": 0.4476, "step": 799 }, { "epoch": 0.6030624263839811, "grad_norm": 12.062220573425293, "learning_rate": 6.914367738298941e-05, "loss": 0.4763, "step": 800 }, { "epoch": 0.6038162544169612, "grad_norm": 6.382950782775879, "learning_rate": 6.891686371867239e-05, "loss": 0.5237, "step": 801 }, { "epoch": 0.6045700824499411, "grad_norm": 7.342101097106934, "learning_rate": 6.869022692380411e-05, "loss": 0.51, "step": 802 }, { "epoch": 0.6053239104829211, "grad_norm": 7.170543670654297, "learning_rate": 6.846376828799451e-05, "loss": 0.4846, "step": 803 }, { "epoch": 0.6060777385159011, "grad_norm": 6.772843360900879, "learning_rate": 6.823748909983994e-05, "loss": 0.4899, "step": 804 }, { "epoch": 0.606831566548881, "grad_norm": 6.159712314605713, "learning_rate": 6.801139064691562e-05, "loss": 0.4651, "step": 805 }, { "epoch": 0.607585394581861, "grad_norm": 6.47841739654541, "learning_rate": 6.778547421576825e-05, "loss": 0.4699, "step": 806 }, { "epoch": 0.608339222614841, "grad_norm": 5.620822906494141, "learning_rate": 6.75597410919089e-05, "loss": 0.4317, "step": 807 }, { "epoch": 0.6090930506478209, "grad_norm": 5.6669392585754395, "learning_rate": 6.733419255980559e-05, "loss": 0.4504, "step": 808 }, { "epoch": 0.6098468786808009, "grad_norm": 5.989339828491211, "learning_rate": 6.710882990287585e-05, "loss": 0.4576, "step": 809 }, { "epoch": 0.610600706713781, "grad_norm": 5.7165751457214355, "learning_rate": 6.688365440347965e-05, "loss": 0.4179, "step": 810 }, { "epoch": 0.6113545347467609, "grad_norm": 6.0307087898254395, "learning_rate": 6.665866734291205e-05, "loss": 0.4815, "step": 811 }, { "epoch": 0.6121083627797409, "grad_norm": 6.319530010223389, "learning_rate": 6.643387000139565e-05, "loss": 0.4407, "step": 812 }, { "epoch": 0.6128621908127209, "grad_norm": 5.93934440612793, "learning_rate": 6.620926365807372e-05, "loss": 0.4081, "step": 813 }, { "epoch": 0.6136160188457008, "grad_norm": 5.771956443786621, "learning_rate": 6.598484959100257e-05, "loss": 0.3936, "step": 814 }, { "epoch": 0.6143698468786808, "grad_norm": 6.20790433883667, "learning_rate": 6.576062907714448e-05, "loss": 0.4513, "step": 815 }, { "epoch": 0.6151236749116608, "grad_norm": 5.739172458648682, "learning_rate": 6.553660339236041e-05, "loss": 0.399, "step": 816 }, { "epoch": 0.6158775029446407, "grad_norm": 6.355349540710449, "learning_rate": 6.53127738114026e-05, "loss": 0.4259, "step": 817 }, { "epoch": 0.6166313309776207, "grad_norm": 5.847348213195801, "learning_rate": 6.508914160790752e-05, "loss": 0.4091, "step": 818 }, { "epoch": 0.6173851590106008, "grad_norm": 5.917300224304199, "learning_rate": 6.486570805438843e-05, "loss": 0.4258, "step": 819 }, { "epoch": 0.6181389870435807, "grad_norm": 6.199348449707031, "learning_rate": 6.46424744222283e-05, "loss": 0.4054, "step": 820 }, { "epoch": 0.6188928150765607, "grad_norm": 6.075807571411133, "learning_rate": 6.441944198167253e-05, "loss": 0.4334, "step": 821 }, { "epoch": 0.6196466431095407, "grad_norm": 5.835407257080078, "learning_rate": 6.419661200182158e-05, "loss": 0.4124, "step": 822 }, { "epoch": 0.6204004711425206, "grad_norm": 6.856280326843262, "learning_rate": 6.397398575062396e-05, "loss": 0.4316, "step": 823 }, { "epoch": 0.6211542991755006, "grad_norm": 6.388029098510742, "learning_rate": 6.375156449486895e-05, "loss": 0.4096, "step": 824 }, { "epoch": 0.6219081272084805, "grad_norm": 6.334976673126221, "learning_rate": 6.352934950017921e-05, "loss": 0.4267, "step": 825 }, { "epoch": 0.6226619552414605, "grad_norm": 6.394600868225098, "learning_rate": 6.330734203100394e-05, "loss": 0.4151, "step": 826 }, { "epoch": 0.6234157832744405, "grad_norm": 6.139026165008545, "learning_rate": 6.308554335061135e-05, "loss": 0.4307, "step": 827 }, { "epoch": 0.6241696113074205, "grad_norm": 6.6982102394104, "learning_rate": 6.286395472108158e-05, "loss": 0.4285, "step": 828 }, { "epoch": 0.6249234393404005, "grad_norm": 5.852738857269287, "learning_rate": 6.26425774032996e-05, "loss": 0.3874, "step": 829 }, { "epoch": 0.6256772673733805, "grad_norm": 6.24067497253418, "learning_rate": 6.2421412656948e-05, "loss": 0.3924, "step": 830 }, { "epoch": 0.6264310954063604, "grad_norm": 6.479643821716309, "learning_rate": 6.220046174049968e-05, "loss": 0.4109, "step": 831 }, { "epoch": 0.6271849234393404, "grad_norm": 6.55532169342041, "learning_rate": 6.19797259112109e-05, "loss": 0.4151, "step": 832 }, { "epoch": 0.6279387514723204, "grad_norm": 5.995844841003418, "learning_rate": 6.175920642511404e-05, "loss": 0.3872, "step": 833 }, { "epoch": 0.6286925795053003, "grad_norm": 6.913110256195068, "learning_rate": 6.153890453701031e-05, "loss": 0.4105, "step": 834 }, { "epoch": 0.6294464075382803, "grad_norm": 6.36851692199707, "learning_rate": 6.131882150046291e-05, "loss": 0.4048, "step": 835 }, { "epoch": 0.6302002355712603, "grad_norm": 5.844064712524414, "learning_rate": 6.109895856778967e-05, "loss": 0.3689, "step": 836 }, { "epoch": 0.6309540636042403, "grad_norm": 7.132351398468018, "learning_rate": 6.087931699005588e-05, "loss": 0.4218, "step": 837 }, { "epoch": 0.6317078916372203, "grad_norm": 6.560583114624023, "learning_rate": 6.065989801706744e-05, "loss": 0.4053, "step": 838 }, { "epoch": 0.6324617196702003, "grad_norm": 6.6530351638793945, "learning_rate": 6.044070289736352e-05, "loss": 0.4061, "step": 839 }, { "epoch": 0.6332155477031802, "grad_norm": 6.5088677406311035, "learning_rate": 6.0221732878209425e-05, "loss": 0.376, "step": 840 }, { "epoch": 0.6339693757361602, "grad_norm": 6.723409175872803, "learning_rate": 6.0002989205589734e-05, "loss": 0.3978, "step": 841 }, { "epoch": 0.6347232037691402, "grad_norm": 9.00965404510498, "learning_rate": 5.978447312420103e-05, "loss": 0.4661, "step": 842 }, { "epoch": 0.6354770318021201, "grad_norm": 8.346488952636719, "learning_rate": 5.9566185877444755e-05, "loss": 0.4812, "step": 843 }, { "epoch": 0.6362308598351001, "grad_norm": 9.07754135131836, "learning_rate": 5.934812870742036e-05, "loss": 0.5042, "step": 844 }, { "epoch": 0.63698468786808, "grad_norm": 9.425755500793457, "learning_rate": 5.913030285491808e-05, "loss": 0.5273, "step": 845 }, { "epoch": 0.6377385159010601, "grad_norm": 8.991804122924805, "learning_rate": 5.891270955941184e-05, "loss": 0.4724, "step": 846 }, { "epoch": 0.6384923439340401, "grad_norm": 9.069438934326172, "learning_rate": 5.869535005905232e-05, "loss": 0.4694, "step": 847 }, { "epoch": 0.63924617196702, "grad_norm": 9.837794303894043, "learning_rate": 5.847822559065992e-05, "loss": 0.4601, "step": 848 }, { "epoch": 0.64, "grad_norm": 10.19363021850586, "learning_rate": 5.8261337389717506e-05, "loss": 0.4776, "step": 849 }, { "epoch": 0.64075382803298, "grad_norm": 11.673394203186035, "learning_rate": 5.804468669036369e-05, "loss": 0.4425, "step": 850 }, { "epoch": 0.6415076560659599, "grad_norm": 6.468347072601318, "learning_rate": 5.7828274725385544e-05, "loss": 0.5469, "step": 851 }, { "epoch": 0.6422614840989399, "grad_norm": 7.060529708862305, "learning_rate": 5.761210272621175e-05, "loss": 0.5067, "step": 852 }, { "epoch": 0.6430153121319199, "grad_norm": 7.569014072418213, "learning_rate": 5.739617192290545e-05, "loss": 0.5057, "step": 853 }, { "epoch": 0.6437691401648998, "grad_norm": 7.41010046005249, "learning_rate": 5.7180483544157546e-05, "loss": 0.4897, "step": 854 }, { "epoch": 0.6445229681978799, "grad_norm": 6.627238750457764, "learning_rate": 5.696503881727917e-05, "loss": 0.5036, "step": 855 }, { "epoch": 0.6452767962308599, "grad_norm": 6.318825721740723, "learning_rate": 5.6749838968195326e-05, "loss": 0.4619, "step": 856 }, { "epoch": 0.6460306242638398, "grad_norm": 5.585279941558838, "learning_rate": 5.653488522143744e-05, "loss": 0.4331, "step": 857 }, { "epoch": 0.6467844522968198, "grad_norm": 5.902019500732422, "learning_rate": 5.6320178800136626e-05, "loss": 0.4596, "step": 858 }, { "epoch": 0.6475382803297998, "grad_norm": 5.5325164794921875, "learning_rate": 5.610572092601659e-05, "loss": 0.4362, "step": 859 }, { "epoch": 0.6482921083627797, "grad_norm": 5.381384372711182, "learning_rate": 5.589151281938695e-05, "loss": 0.4294, "step": 860 }, { "epoch": 0.6490459363957597, "grad_norm": 6.080218315124512, "learning_rate": 5.56775556991358e-05, "loss": 0.4304, "step": 861 }, { "epoch": 0.6497997644287397, "grad_norm": 5.510005950927734, "learning_rate": 5.5463850782723346e-05, "loss": 0.4157, "step": 862 }, { "epoch": 0.6505535924617196, "grad_norm": 5.572638511657715, "learning_rate": 5.5250399286174546e-05, "loss": 0.4238, "step": 863 }, { "epoch": 0.6513074204946997, "grad_norm": 5.32048225402832, "learning_rate": 5.50372024240724e-05, "loss": 0.3929, "step": 864 }, { "epoch": 0.6520612485276797, "grad_norm": 5.80560827255249, "learning_rate": 5.48242614095509e-05, "loss": 0.4251, "step": 865 }, { "epoch": 0.6528150765606596, "grad_norm": 5.714180946350098, "learning_rate": 5.461157745428841e-05, "loss": 0.4318, "step": 866 }, { "epoch": 0.6535689045936396, "grad_norm": 5.553015232086182, "learning_rate": 5.439915176850037e-05, "loss": 0.3996, "step": 867 }, { "epoch": 0.6543227326266196, "grad_norm": 5.774811744689941, "learning_rate": 5.418698556093271e-05, "loss": 0.4298, "step": 868 }, { "epoch": 0.6550765606595995, "grad_norm": 5.804990291595459, "learning_rate": 5.397508003885483e-05, "loss": 0.4119, "step": 869 }, { "epoch": 0.6558303886925795, "grad_norm": 5.6263556480407715, "learning_rate": 5.3763436408052904e-05, "loss": 0.394, "step": 870 }, { "epoch": 0.6565842167255594, "grad_norm": 5.699732303619385, "learning_rate": 5.3552055872822636e-05, "loss": 0.4152, "step": 871 }, { "epoch": 0.6573380447585394, "grad_norm": 5.353825569152832, "learning_rate": 5.334093963596294e-05, "loss": 0.3798, "step": 872 }, { "epoch": 0.6580918727915195, "grad_norm": 5.929776668548584, "learning_rate": 5.313008889876865e-05, "loss": 0.4142, "step": 873 }, { "epoch": 0.6588457008244994, "grad_norm": 6.101897716522217, "learning_rate": 5.2919504861023903e-05, "loss": 0.4396, "step": 874 }, { "epoch": 0.6595995288574794, "grad_norm": 6.041595458984375, "learning_rate": 5.270918872099522e-05, "loss": 0.4455, "step": 875 }, { "epoch": 0.6603533568904594, "grad_norm": 5.795607566833496, "learning_rate": 5.249914167542486e-05, "loss": 0.3927, "step": 876 }, { "epoch": 0.6611071849234393, "grad_norm": 6.169924259185791, "learning_rate": 5.228936491952363e-05, "loss": 0.4022, "step": 877 }, { "epoch": 0.6618610129564193, "grad_norm": 5.870789527893066, "learning_rate": 5.207985964696462e-05, "loss": 0.4012, "step": 878 }, { "epoch": 0.6626148409893993, "grad_norm": 6.345909595489502, "learning_rate": 5.1870627049875954e-05, "loss": 0.3814, "step": 879 }, { "epoch": 0.6633686690223792, "grad_norm": 6.1364569664001465, "learning_rate": 5.16616683188342e-05, "loss": 0.4032, "step": 880 }, { "epoch": 0.6641224970553592, "grad_norm": 5.976447582244873, "learning_rate": 5.145298464285757e-05, "loss": 0.3814, "step": 881 }, { "epoch": 0.6648763250883393, "grad_norm": 7.229459285736084, "learning_rate": 5.12445772093992e-05, "loss": 0.4171, "step": 882 }, { "epoch": 0.6656301531213192, "grad_norm": 5.863222599029541, "learning_rate": 5.103644720434027e-05, "loss": 0.3782, "step": 883 }, { "epoch": 0.6663839811542992, "grad_norm": 6.049070835113525, "learning_rate": 5.082859581198344e-05, "loss": 0.3789, "step": 884 }, { "epoch": 0.6671378091872792, "grad_norm": 6.35960578918457, "learning_rate": 5.062102421504593e-05, "loss": 0.4086, "step": 885 }, { "epoch": 0.6678916372202591, "grad_norm": 6.470669746398926, "learning_rate": 5.041373359465289e-05, "loss": 0.4076, "step": 886 }, { "epoch": 0.6686454652532391, "grad_norm": 6.241630554199219, "learning_rate": 5.020672513033066e-05, "loss": 0.4007, "step": 887 }, { "epoch": 0.669399293286219, "grad_norm": 6.308516502380371, "learning_rate": 5.000000000000002e-05, "loss": 0.3754, "step": 888 }, { "epoch": 0.670153121319199, "grad_norm": 6.356692314147949, "learning_rate": 4.9793559379969566e-05, "loss": 0.3973, "step": 889 }, { "epoch": 0.670906949352179, "grad_norm": 7.087871074676514, "learning_rate": 4.958740444492892e-05, "loss": 0.4128, "step": 890 }, { "epoch": 0.6716607773851591, "grad_norm": 7.447615623474121, "learning_rate": 4.9381536367942195e-05, "loss": 0.4111, "step": 891 }, { "epoch": 0.672414605418139, "grad_norm": 7.260590076446533, "learning_rate": 4.917595632044113e-05, "loss": 0.3799, "step": 892 }, { "epoch": 0.673168433451119, "grad_norm": 7.701971530914307, "learning_rate": 4.8970665472218537e-05, "loss": 0.4017, "step": 893 }, { "epoch": 0.673922261484099, "grad_norm": 8.021989822387695, "learning_rate": 4.8765664991421634e-05, "loss": 0.4536, "step": 894 }, { "epoch": 0.6746760895170789, "grad_norm": 8.987250328063965, "learning_rate": 4.856095604454539e-05, "loss": 0.4939, "step": 895 }, { "epoch": 0.6754299175500589, "grad_norm": 10.436625480651855, "learning_rate": 4.835653979642585e-05, "loss": 0.5239, "step": 896 }, { "epoch": 0.6761837455830388, "grad_norm": 9.789538383483887, "learning_rate": 4.815241741023367e-05, "loss": 0.4798, "step": 897 }, { "epoch": 0.6769375736160188, "grad_norm": 9.678764343261719, "learning_rate": 4.7948590047467153e-05, "loss": 0.4441, "step": 898 }, { "epoch": 0.6776914016489988, "grad_norm": 10.444610595703125, "learning_rate": 4.774505886794609e-05, "loss": 0.4201, "step": 899 }, { "epoch": 0.6784452296819788, "grad_norm": 12.58081340789795, "learning_rate": 4.754182502980477e-05, "loss": 0.4634, "step": 900 }, { "epoch": 0.6791990577149588, "grad_norm": 5.85378885269165, "learning_rate": 4.7338889689485624e-05, "loss": 0.5182, "step": 901 }, { "epoch": 0.6799528857479388, "grad_norm": 6.6499857902526855, "learning_rate": 4.713625400173247e-05, "loss": 0.5216, "step": 902 }, { "epoch": 0.6807067137809187, "grad_norm": 6.543797016143799, "learning_rate": 4.693391911958426e-05, "loss": 0.4798, "step": 903 }, { "epoch": 0.6814605418138987, "grad_norm": 6.197330951690674, "learning_rate": 4.673188619436798e-05, "loss": 0.4892, "step": 904 }, { "epoch": 0.6822143698468787, "grad_norm": 6.185276031494141, "learning_rate": 4.6530156375692726e-05, "loss": 0.474, "step": 905 }, { "epoch": 0.6829681978798586, "grad_norm": 5.581246376037598, "learning_rate": 4.632873081144267e-05, "loss": 0.4498, "step": 906 }, { "epoch": 0.6837220259128386, "grad_norm": 5.916640281677246, "learning_rate": 4.6127610647770767e-05, "loss": 0.4619, "step": 907 }, { "epoch": 0.6844758539458186, "grad_norm": 5.591888904571533, "learning_rate": 4.592679702909216e-05, "loss": 0.4275, "step": 908 }, { "epoch": 0.6852296819787986, "grad_norm": 5.287500858306885, "learning_rate": 4.572629109807782e-05, "loss": 0.4073, "step": 909 }, { "epoch": 0.6859835100117786, "grad_norm": 5.325054168701172, "learning_rate": 4.552609399564762e-05, "loss": 0.3894, "step": 910 }, { "epoch": 0.6867373380447586, "grad_norm": 5.576198101043701, "learning_rate": 4.532620686096446e-05, "loss": 0.4185, "step": 911 }, { "epoch": 0.6874911660777385, "grad_norm": 5.555250644683838, "learning_rate": 4.5126630831427264e-05, "loss": 0.3818, "step": 912 }, { "epoch": 0.6882449941107185, "grad_norm": 5.309383869171143, "learning_rate": 4.492736704266475e-05, "loss": 0.3835, "step": 913 }, { "epoch": 0.6889988221436985, "grad_norm": 5.426351547241211, "learning_rate": 4.472841662852888e-05, "loss": 0.4087, "step": 914 }, { "epoch": 0.6897526501766784, "grad_norm": 5.882096767425537, "learning_rate": 4.452978072108859e-05, "loss": 0.4398, "step": 915 }, { "epoch": 0.6905064782096584, "grad_norm": 5.80626916885376, "learning_rate": 4.4331460450623064e-05, "loss": 0.4234, "step": 916 }, { "epoch": 0.6912603062426383, "grad_norm": 5.8705291748046875, "learning_rate": 4.413345694561549e-05, "loss": 0.4223, "step": 917 }, { "epoch": 0.6920141342756184, "grad_norm": 5.822587966918945, "learning_rate": 4.393577133274658e-05, "loss": 0.4314, "step": 918 }, { "epoch": 0.6927679623085984, "grad_norm": 6.2686872482299805, "learning_rate": 4.373840473688829e-05, "loss": 0.459, "step": 919 }, { "epoch": 0.6935217903415783, "grad_norm": 5.543201923370361, "learning_rate": 4.354135828109707e-05, "loss": 0.3963, "step": 920 }, { "epoch": 0.6942756183745583, "grad_norm": 5.7019267082214355, "learning_rate": 4.3344633086607955e-05, "loss": 0.3964, "step": 921 }, { "epoch": 0.6950294464075383, "grad_norm": 5.6861958503723145, "learning_rate": 4.3148230272827784e-05, "loss": 0.4175, "step": 922 }, { "epoch": 0.6957832744405182, "grad_norm": 5.791751384735107, "learning_rate": 4.295215095732904e-05, "loss": 0.4196, "step": 923 }, { "epoch": 0.6965371024734982, "grad_norm": 6.20761251449585, "learning_rate": 4.275639625584338e-05, "loss": 0.4159, "step": 924 }, { "epoch": 0.6972909305064782, "grad_norm": 6.440983772277832, "learning_rate": 4.256096728225548e-05, "loss": 0.418, "step": 925 }, { "epoch": 0.6980447585394581, "grad_norm": 5.713172435760498, "learning_rate": 4.236586514859633e-05, "loss": 0.4084, "step": 926 }, { "epoch": 0.6987985865724382, "grad_norm": 5.674785137176514, "learning_rate": 4.217109096503736e-05, "loss": 0.3978, "step": 927 }, { "epoch": 0.6995524146054182, "grad_norm": 6.123269081115723, "learning_rate": 4.197664583988376e-05, "loss": 0.421, "step": 928 }, { "epoch": 0.7003062426383981, "grad_norm": 5.961802959442139, "learning_rate": 4.1782530879568374e-05, "loss": 0.4027, "step": 929 }, { "epoch": 0.7010600706713781, "grad_norm": 6.020455360412598, "learning_rate": 4.1588747188645275e-05, "loss": 0.3978, "step": 930 }, { "epoch": 0.7018138987043581, "grad_norm": 5.788726329803467, "learning_rate": 4.1395295869783615e-05, "loss": 0.3744, "step": 931 }, { "epoch": 0.702567726737338, "grad_norm": 6.581162929534912, "learning_rate": 4.1202178023761195e-05, "loss": 0.4003, "step": 932 }, { "epoch": 0.703321554770318, "grad_norm": 5.601202011108398, "learning_rate": 4.100939474945843e-05, "loss": 0.37, "step": 933 }, { "epoch": 0.704075382803298, "grad_norm": 6.49223518371582, "learning_rate": 4.0816947143851816e-05, "loss": 0.4088, "step": 934 }, { "epoch": 0.7048292108362779, "grad_norm": 6.10722541809082, "learning_rate": 4.0624836302007886e-05, "loss": 0.3835, "step": 935 }, { "epoch": 0.705583038869258, "grad_norm": 6.136714935302734, "learning_rate": 4.0433063317076893e-05, "loss": 0.4056, "step": 936 }, { "epoch": 0.706336866902238, "grad_norm": 6.344220161437988, "learning_rate": 4.024162928028663e-05, "loss": 0.386, "step": 937 }, { "epoch": 0.7070906949352179, "grad_norm": 7.188864231109619, "learning_rate": 4.0050535280936205e-05, "loss": 0.3849, "step": 938 }, { "epoch": 0.7078445229681979, "grad_norm": 6.800889492034912, "learning_rate": 3.985978240638981e-05, "loss": 0.3989, "step": 939 }, { "epoch": 0.7085983510011779, "grad_norm": 7.130059242248535, "learning_rate": 3.966937174207066e-05, "loss": 0.3821, "step": 940 }, { "epoch": 0.7093521790341578, "grad_norm": 6.849576473236084, "learning_rate": 3.947930437145464e-05, "loss": 0.3843, "step": 941 }, { "epoch": 0.7101060070671378, "grad_norm": 7.004662036895752, "learning_rate": 3.928958137606421e-05, "loss": 0.3686, "step": 942 }, { "epoch": 0.7108598351001177, "grad_norm": 8.136757850646973, "learning_rate": 3.910020383546233e-05, "loss": 0.4558, "step": 943 }, { "epoch": 0.7116136631330977, "grad_norm": 8.616293907165527, "learning_rate": 3.8911172827246215e-05, "loss": 0.4368, "step": 944 }, { "epoch": 0.7123674911660778, "grad_norm": 8.701359748840332, "learning_rate": 3.8722489427041185e-05, "loss": 0.4512, "step": 945 }, { "epoch": 0.7131213191990577, "grad_norm": 9.437173843383789, "learning_rate": 3.853415470849479e-05, "loss": 0.481, "step": 946 }, { "epoch": 0.7138751472320377, "grad_norm": 10.383941650390625, "learning_rate": 3.834616974327021e-05, "loss": 0.5005, "step": 947 }, { "epoch": 0.7146289752650177, "grad_norm": 9.366165161132812, "learning_rate": 3.815853560104075e-05, "loss": 0.4548, "step": 948 }, { "epoch": 0.7153828032979976, "grad_norm": 9.855792999267578, "learning_rate": 3.7971253349483285e-05, "loss": 0.4908, "step": 949 }, { "epoch": 0.7161366313309776, "grad_norm": 11.261048316955566, "learning_rate": 3.7784324054272405e-05, "loss": 0.4601, "step": 950 }, { "epoch": 0.7168904593639576, "grad_norm": 5.492030143737793, "learning_rate": 3.759774877907428e-05, "loss": 0.5291, "step": 951 }, { "epoch": 0.7176442873969375, "grad_norm": 6.00732421875, "learning_rate": 3.741152858554077e-05, "loss": 0.5058, "step": 952 }, { "epoch": 0.7183981154299176, "grad_norm": 5.992036819458008, "learning_rate": 3.722566453330298e-05, "loss": 0.5028, "step": 953 }, { "epoch": 0.7191519434628976, "grad_norm": 5.949222564697266, "learning_rate": 3.7040157679965796e-05, "loss": 0.4631, "step": 954 }, { "epoch": 0.7199057714958775, "grad_norm": 5.833024978637695, "learning_rate": 3.6855009081101355e-05, "loss": 0.449, "step": 955 }, { "epoch": 0.7206595995288575, "grad_norm": 5.746013641357422, "learning_rate": 3.6670219790243344e-05, "loss": 0.4442, "step": 956 }, { "epoch": 0.7214134275618375, "grad_norm": 5.595402240753174, "learning_rate": 3.648579085888085e-05, "loss": 0.4353, "step": 957 }, { "epoch": 0.7221672555948174, "grad_norm": 5.437952995300293, "learning_rate": 3.630172333645261e-05, "loss": 0.434, "step": 958 }, { "epoch": 0.7229210836277974, "grad_norm": 5.620044231414795, "learning_rate": 3.611801827034059e-05, "loss": 0.4137, "step": 959 }, { "epoch": 0.7236749116607774, "grad_norm": 5.448288440704346, "learning_rate": 3.593467670586457e-05, "loss": 0.4197, "step": 960 }, { "epoch": 0.7244287396937573, "grad_norm": 5.672021389007568, "learning_rate": 3.5751699686275786e-05, "loss": 0.4495, "step": 961 }, { "epoch": 0.7251825677267374, "grad_norm": 5.292520046234131, "learning_rate": 3.556908825275117e-05, "loss": 0.4203, "step": 962 }, { "epoch": 0.7259363957597174, "grad_norm": 5.522578239440918, "learning_rate": 3.538684344438736e-05, "loss": 0.4043, "step": 963 }, { "epoch": 0.7266902237926973, "grad_norm": 5.811888694763184, "learning_rate": 3.520496629819494e-05, "loss": 0.4239, "step": 964 }, { "epoch": 0.7274440518256773, "grad_norm": 5.410277366638184, "learning_rate": 3.502345784909229e-05, "loss": 0.4163, "step": 965 }, { "epoch": 0.7281978798586572, "grad_norm": 5.810190677642822, "learning_rate": 3.484231912989989e-05, "loss": 0.4323, "step": 966 }, { "epoch": 0.7289517078916372, "grad_norm": 5.343920707702637, "learning_rate": 3.466155117133433e-05, "loss": 0.4153, "step": 967 }, { "epoch": 0.7297055359246172, "grad_norm": 5.489987373352051, "learning_rate": 3.448115500200263e-05, "loss": 0.3828, "step": 968 }, { "epoch": 0.7304593639575971, "grad_norm": 5.753129005432129, "learning_rate": 3.430113164839601e-05, "loss": 0.4047, "step": 969 }, { "epoch": 0.7312131919905771, "grad_norm": 5.8478569984436035, "learning_rate": 3.4121482134884575e-05, "loss": 0.4231, "step": 970 }, { "epoch": 0.7319670200235572, "grad_norm": 6.3078413009643555, "learning_rate": 3.3942207483710986e-05, "loss": 0.3913, "step": 971 }, { "epoch": 0.7327208480565371, "grad_norm": 5.719088077545166, "learning_rate": 3.3763308714984974e-05, "loss": 0.4149, "step": 972 }, { "epoch": 0.7334746760895171, "grad_norm": 5.784895420074463, "learning_rate": 3.358478684667734e-05, "loss": 0.3997, "step": 973 }, { "epoch": 0.7342285041224971, "grad_norm": 5.888166427612305, "learning_rate": 3.3406642894614394e-05, "loss": 0.4064, "step": 974 }, { "epoch": 0.734982332155477, "grad_norm": 6.573143482208252, "learning_rate": 3.3228877872471786e-05, "loss": 0.4188, "step": 975 }, { "epoch": 0.735736160188457, "grad_norm": 5.861452102661133, "learning_rate": 3.305149279176921e-05, "loss": 0.3993, "step": 976 }, { "epoch": 0.736489988221437, "grad_norm": 5.746969223022461, "learning_rate": 3.287448866186428e-05, "loss": 0.4014, "step": 977 }, { "epoch": 0.7372438162544169, "grad_norm": 5.95499849319458, "learning_rate": 3.269786648994697e-05, "loss": 0.4129, "step": 978 }, { "epoch": 0.7379976442873969, "grad_norm": 5.868785858154297, "learning_rate": 3.252162728103382e-05, "loss": 0.4006, "step": 979 }, { "epoch": 0.738751472320377, "grad_norm": 6.216129779815674, "learning_rate": 3.234577203796223e-05, "loss": 0.4097, "step": 980 }, { "epoch": 0.7395053003533569, "grad_norm": 5.94473934173584, "learning_rate": 3.217030176138474e-05, "loss": 0.3947, "step": 981 }, { "epoch": 0.7402591283863369, "grad_norm": 5.822911262512207, "learning_rate": 3.199521744976342e-05, "loss": 0.3838, "step": 982 }, { "epoch": 0.7410129564193169, "grad_norm": 5.968900203704834, "learning_rate": 3.182052009936404e-05, "loss": 0.3945, "step": 983 }, { "epoch": 0.7417667844522968, "grad_norm": 6.497354984283447, "learning_rate": 3.164621070425051e-05, "loss": 0.4138, "step": 984 }, { "epoch": 0.7425206124852768, "grad_norm": 6.382023334503174, "learning_rate": 3.147229025627922e-05, "loss": 0.37, "step": 985 }, { "epoch": 0.7432744405182568, "grad_norm": 6.162110328674316, "learning_rate": 3.129875974509332e-05, "loss": 0.3743, "step": 986 }, { "epoch": 0.7440282685512367, "grad_norm": 6.0412116050720215, "learning_rate": 3.1125620158117186e-05, "loss": 0.3714, "step": 987 }, { "epoch": 0.7447820965842167, "grad_norm": 6.072629451751709, "learning_rate": 3.095287248055069e-05, "loss": 0.369, "step": 988 }, { "epoch": 0.7455359246171968, "grad_norm": 6.4712958335876465, "learning_rate": 3.078051769536378e-05, "loss": 0.3956, "step": 989 }, { "epoch": 0.7462897526501767, "grad_norm": 6.292232036590576, "learning_rate": 3.060855678329063e-05, "loss": 0.3755, "step": 990 }, { "epoch": 0.7470435806831567, "grad_norm": 6.797161102294922, "learning_rate": 3.043699072282429e-05, "loss": 0.3941, "step": 991 }, { "epoch": 0.7477974087161366, "grad_norm": 7.063961029052734, "learning_rate": 3.0265820490210973e-05, "loss": 0.4085, "step": 992 }, { "epoch": 0.7485512367491166, "grad_norm": 8.036771774291992, "learning_rate": 3.0095047059444546e-05, "loss": 0.4553, "step": 993 }, { "epoch": 0.7493050647820966, "grad_norm": 8.343942642211914, "learning_rate": 2.9924671402261018e-05, "loss": 0.4532, "step": 994 }, { "epoch": 0.7500588928150765, "grad_norm": 8.597431182861328, "learning_rate": 2.9754694488133038e-05, "loss": 0.4544, "step": 995 }, { "epoch": 0.7508127208480565, "grad_norm": 8.797038078308105, "learning_rate": 2.958511728426414e-05, "loss": 0.4565, "step": 996 }, { "epoch": 0.7508127208480565, "eval_loss": 0.42347389459609985, "eval_runtime": 127.0592, "eval_samples_per_second": 17.59, "eval_steps_per_second": 8.799, "step": 996 }, { "epoch": 0.7515665488810365, "grad_norm": 9.90727710723877, "learning_rate": 2.941594075558366e-05, "loss": 0.4791, "step": 997 }, { "epoch": 0.7523203769140165, "grad_norm": 9.148994445800781, "learning_rate": 2.9247165864740856e-05, "loss": 0.4488, "step": 998 }, { "epoch": 0.7530742049469965, "grad_norm": 10.751917839050293, "learning_rate": 2.9078793572099616e-05, "loss": 0.4695, "step": 999 }, { "epoch": 0.7538280329799765, "grad_norm": 12.66123104095459, "learning_rate": 2.8910824835732952e-05, "loss": 0.4773, "step": 1000 }, { "epoch": 0.7545818610129564, "grad_norm": 5.507136821746826, "learning_rate": 2.8743260611417665e-05, "loss": 0.5073, "step": 1001 }, { "epoch": 0.7553356890459364, "grad_norm": 5.805990695953369, "learning_rate": 2.857610185262859e-05, "loss": 0.4735, "step": 1002 }, { "epoch": 0.7560895170789164, "grad_norm": 5.612555980682373, "learning_rate": 2.8409349510533578e-05, "loss": 0.4536, "step": 1003 }, { "epoch": 0.7568433451118963, "grad_norm": 5.850246906280518, "learning_rate": 2.8243004533987793e-05, "loss": 0.4578, "step": 1004 }, { "epoch": 0.7575971731448763, "grad_norm": 5.569720268249512, "learning_rate": 2.8077067869528417e-05, "loss": 0.4135, "step": 1005 }, { "epoch": 0.7583510011778563, "grad_norm": 5.9112114906311035, "learning_rate": 2.7911540461369222e-05, "loss": 0.4445, "step": 1006 }, { "epoch": 0.7591048292108363, "grad_norm": 5.9236249923706055, "learning_rate": 2.774642325139535e-05, "loss": 0.4402, "step": 1007 }, { "epoch": 0.7598586572438163, "grad_norm": 6.210232257843018, "learning_rate": 2.7581717179157606e-05, "loss": 0.4605, "step": 1008 }, { "epoch": 0.7606124852767963, "grad_norm": 5.880030155181885, "learning_rate": 2.7417423181867585e-05, "loss": 0.4227, "step": 1009 }, { "epoch": 0.7613663133097762, "grad_norm": 5.549881458282471, "learning_rate": 2.72535421943919e-05, "loss": 0.4168, "step": 1010 }, { "epoch": 0.7621201413427562, "grad_norm": 5.586158275604248, "learning_rate": 2.7090075149247217e-05, "loss": 0.4334, "step": 1011 }, { "epoch": 0.7628739693757361, "grad_norm": 5.5952348709106445, "learning_rate": 2.6927022976594607e-05, "loss": 0.4232, "step": 1012 }, { "epoch": 0.7636277974087161, "grad_norm": 5.478029727935791, "learning_rate": 2.676438660423457e-05, "loss": 0.4053, "step": 1013 }, { "epoch": 0.7643816254416961, "grad_norm": 5.441522121429443, "learning_rate": 2.660216695760157e-05, "loss": 0.3847, "step": 1014 }, { "epoch": 0.765135453474676, "grad_norm": 5.584785461425781, "learning_rate": 2.6440364959758813e-05, "loss": 0.4098, "step": 1015 }, { "epoch": 0.7658892815076561, "grad_norm": 5.545854091644287, "learning_rate": 2.6278981531392945e-05, "loss": 0.4002, "step": 1016 }, { "epoch": 0.7666431095406361, "grad_norm": 5.697778701782227, "learning_rate": 2.6118017590809017e-05, "loss": 0.4013, "step": 1017 }, { "epoch": 0.767396937573616, "grad_norm": 6.265735626220703, "learning_rate": 2.595747405392491e-05, "loss": 0.4102, "step": 1018 }, { "epoch": 0.768150765606596, "grad_norm": 5.284882545471191, "learning_rate": 2.579735183426649e-05, "loss": 0.3747, "step": 1019 }, { "epoch": 0.768904593639576, "grad_norm": 5.939345359802246, "learning_rate": 2.5637651842962164e-05, "loss": 0.4019, "step": 1020 }, { "epoch": 0.7696584216725559, "grad_norm": 5.655182838439941, "learning_rate": 2.5478374988737753e-05, "loss": 0.4038, "step": 1021 }, { "epoch": 0.7704122497055359, "grad_norm": 5.510229587554932, "learning_rate": 2.531952217791136e-05, "loss": 0.3912, "step": 1022 }, { "epoch": 0.7711660777385159, "grad_norm": 5.720643997192383, "learning_rate": 2.5161094314388278e-05, "loss": 0.3995, "step": 1023 }, { "epoch": 0.7719199057714958, "grad_norm": 5.860435962677002, "learning_rate": 2.5003092299655584e-05, "loss": 0.3995, "step": 1024 }, { "epoch": 0.7726737338044759, "grad_norm": 6.223293304443359, "learning_rate": 2.4845517032777364e-05, "loss": 0.4424, "step": 1025 }, { "epoch": 0.7734275618374559, "grad_norm": 6.027644157409668, "learning_rate": 2.4688369410389334e-05, "loss": 0.4299, "step": 1026 }, { "epoch": 0.7741813898704358, "grad_norm": 5.946674346923828, "learning_rate": 2.4531650326693822e-05, "loss": 0.3849, "step": 1027 }, { "epoch": 0.7749352179034158, "grad_norm": 6.277134895324707, "learning_rate": 2.4375360673454718e-05, "loss": 0.4147, "step": 1028 }, { "epoch": 0.7756890459363958, "grad_norm": 6.024038314819336, "learning_rate": 2.4219501339992334e-05, "loss": 0.3774, "step": 1029 }, { "epoch": 0.7764428739693757, "grad_norm": 5.8574910163879395, "learning_rate": 2.406407321317835e-05, "loss": 0.3865, "step": 1030 }, { "epoch": 0.7771967020023557, "grad_norm": 6.022578239440918, "learning_rate": 2.3909077177430893e-05, "loss": 0.3957, "step": 1031 }, { "epoch": 0.7779505300353357, "grad_norm": 5.923416614532471, "learning_rate": 2.3754514114709304e-05, "loss": 0.3836, "step": 1032 }, { "epoch": 0.7787043580683156, "grad_norm": 6.270270824432373, "learning_rate": 2.3600384904509254e-05, "loss": 0.3979, "step": 1033 }, { "epoch": 0.7794581861012957, "grad_norm": 6.285928726196289, "learning_rate": 2.3446690423857685e-05, "loss": 0.4098, "step": 1034 }, { "epoch": 0.7802120141342757, "grad_norm": 6.104770660400391, "learning_rate": 2.3293431547307887e-05, "loss": 0.3746, "step": 1035 }, { "epoch": 0.7809658421672556, "grad_norm": 6.284374237060547, "learning_rate": 2.31406091469344e-05, "loss": 0.3933, "step": 1036 }, { "epoch": 0.7817196702002356, "grad_norm": 6.502585411071777, "learning_rate": 2.298822409232817e-05, "loss": 0.3964, "step": 1037 }, { "epoch": 0.7824734982332155, "grad_norm": 6.121708869934082, "learning_rate": 2.2836277250591574e-05, "loss": 0.3822, "step": 1038 }, { "epoch": 0.7832273262661955, "grad_norm": 7.069113731384277, "learning_rate": 2.2684769486333445e-05, "loss": 0.3919, "step": 1039 }, { "epoch": 0.7839811542991755, "grad_norm": 6.825623035430908, "learning_rate": 2.2533701661664154e-05, "loss": 0.4296, "step": 1040 }, { "epoch": 0.7847349823321554, "grad_norm": 7.632999897003174, "learning_rate": 2.2383074636190748e-05, "loss": 0.4266, "step": 1041 }, { "epoch": 0.7854888103651354, "grad_norm": 7.41874885559082, "learning_rate": 2.2232889267012038e-05, "loss": 0.4263, "step": 1042 }, { "epoch": 0.7862426383981155, "grad_norm": 7.6582417488098145, "learning_rate": 2.2083146408713673e-05, "loss": 0.4351, "step": 1043 }, { "epoch": 0.7869964664310954, "grad_norm": 9.17532730102539, "learning_rate": 2.1933846913363466e-05, "loss": 0.5107, "step": 1044 }, { "epoch": 0.7877502944640754, "grad_norm": 9.609545707702637, "learning_rate": 2.178499163050617e-05, "loss": 0.4606, "step": 1045 }, { "epoch": 0.7885041224970554, "grad_norm": 9.567949295043945, "learning_rate": 2.1636581407159105e-05, "loss": 0.4663, "step": 1046 }, { "epoch": 0.7892579505300353, "grad_norm": 9.527708053588867, "learning_rate": 2.1488617087806982e-05, "loss": 0.4712, "step": 1047 }, { "epoch": 0.7900117785630153, "grad_norm": 9.680562973022461, "learning_rate": 2.1341099514397266e-05, "loss": 0.4975, "step": 1048 }, { "epoch": 0.7907656065959953, "grad_norm": 10.399216651916504, "learning_rate": 2.1194029526335303e-05, "loss": 0.4586, "step": 1049 }, { "epoch": 0.7915194346289752, "grad_norm": 10.869539260864258, "learning_rate": 2.1047407960479702e-05, "loss": 0.4429, "step": 1050 }, { "epoch": 0.7922732626619552, "grad_norm": 5.385607719421387, "learning_rate": 2.0901235651137284e-05, "loss": 0.5019, "step": 1051 }, { "epoch": 0.7930270906949353, "grad_norm": 5.6260223388671875, "learning_rate": 2.0755513430058672e-05, "loss": 0.4988, "step": 1052 }, { "epoch": 0.7937809187279152, "grad_norm": 5.487570762634277, "learning_rate": 2.0610242126433297e-05, "loss": 0.4594, "step": 1053 }, { "epoch": 0.7945347467608952, "grad_norm": 5.6461591720581055, "learning_rate": 2.0465422566884805e-05, "loss": 0.4642, "step": 1054 }, { "epoch": 0.7952885747938752, "grad_norm": 5.7345123291015625, "learning_rate": 2.0321055575466284e-05, "loss": 0.4442, "step": 1055 }, { "epoch": 0.7960424028268551, "grad_norm": 5.918202877044678, "learning_rate": 2.0177141973655766e-05, "loss": 0.4708, "step": 1056 }, { "epoch": 0.7967962308598351, "grad_norm": 5.593347549438477, "learning_rate": 2.0033682580351144e-05, "loss": 0.4277, "step": 1057 }, { "epoch": 0.797550058892815, "grad_norm": 5.557769775390625, "learning_rate": 1.9890678211866033e-05, "loss": 0.4267, "step": 1058 }, { "epoch": 0.798303886925795, "grad_norm": 5.38918924331665, "learning_rate": 1.9748129681924675e-05, "loss": 0.4112, "step": 1059 }, { "epoch": 0.799057714958775, "grad_norm": 5.82417631149292, "learning_rate": 1.9606037801657673e-05, "loss": 0.4104, "step": 1060 }, { "epoch": 0.799811542991755, "grad_norm": 5.548363208770752, "learning_rate": 1.9464403379596963e-05, "loss": 0.4127, "step": 1061 }, { "epoch": 0.800565371024735, "grad_norm": 5.548163890838623, "learning_rate": 1.932322722167168e-05, "loss": 0.4198, "step": 1062 }, { "epoch": 0.801319199057715, "grad_norm": 5.443014621734619, "learning_rate": 1.9182510131203224e-05, "loss": 0.4012, "step": 1063 }, { "epoch": 0.802073027090695, "grad_norm": 5.750105381011963, "learning_rate": 1.9042252908900814e-05, "loss": 0.4075, "step": 1064 }, { "epoch": 0.8028268551236749, "grad_norm": 5.6281418800354, "learning_rate": 1.8902456352856925e-05, "loss": 0.3896, "step": 1065 }, { "epoch": 0.8035806831566549, "grad_norm": 5.443961143493652, "learning_rate": 1.8763121258542815e-05, "loss": 0.4057, "step": 1066 }, { "epoch": 0.8043345111896348, "grad_norm": 5.808502674102783, "learning_rate": 1.86242484188038e-05, "loss": 0.4137, "step": 1067 }, { "epoch": 0.8050883392226148, "grad_norm": 5.866790294647217, "learning_rate": 1.848583862385501e-05, "loss": 0.4129, "step": 1068 }, { "epoch": 0.8058421672555948, "grad_norm": 5.517582893371582, "learning_rate": 1.8347892661276656e-05, "loss": 0.3901, "step": 1069 }, { "epoch": 0.8065959952885748, "grad_norm": 6.088197231292725, "learning_rate": 1.82104113160097e-05, "loss": 0.4125, "step": 1070 }, { "epoch": 0.8073498233215548, "grad_norm": 5.613511562347412, "learning_rate": 1.8073395370351287e-05, "loss": 0.3968, "step": 1071 }, { "epoch": 0.8081036513545348, "grad_norm": 5.712565898895264, "learning_rate": 1.7936845603950447e-05, "loss": 0.3925, "step": 1072 }, { "epoch": 0.8088574793875147, "grad_norm": 5.371545314788818, "learning_rate": 1.780076279380337e-05, "loss": 0.3589, "step": 1073 }, { "epoch": 0.8096113074204947, "grad_norm": 5.599592208862305, "learning_rate": 1.7665147714249376e-05, "loss": 0.3838, "step": 1074 }, { "epoch": 0.8103651354534747, "grad_norm": 6.015298843383789, "learning_rate": 1.753000113696617e-05, "loss": 0.386, "step": 1075 }, { "epoch": 0.8111189634864546, "grad_norm": 5.434444427490234, "learning_rate": 1.7395323830965605e-05, "loss": 0.3771, "step": 1076 }, { "epoch": 0.8118727915194346, "grad_norm": 6.145053863525391, "learning_rate": 1.726111656258932e-05, "loss": 0.4039, "step": 1077 }, { "epoch": 0.8126266195524146, "grad_norm": 5.7801384925842285, "learning_rate": 1.7127380095504296e-05, "loss": 0.3955, "step": 1078 }, { "epoch": 0.8133804475853946, "grad_norm": 5.640938758850098, "learning_rate": 1.699411519069858e-05, "loss": 0.3788, "step": 1079 }, { "epoch": 0.8141342756183746, "grad_norm": 5.714921951293945, "learning_rate": 1.686132260647696e-05, "loss": 0.3637, "step": 1080 }, { "epoch": 0.8148881036513546, "grad_norm": 6.3913750648498535, "learning_rate": 1.6729003098456576e-05, "loss": 0.3815, "step": 1081 }, { "epoch": 0.8156419316843345, "grad_norm": 5.981407642364502, "learning_rate": 1.6597157419562703e-05, "loss": 0.3756, "step": 1082 }, { "epoch": 0.8163957597173145, "grad_norm": 6.408857822418213, "learning_rate": 1.646578632002439e-05, "loss": 0.4219, "step": 1083 }, { "epoch": 0.8171495877502944, "grad_norm": 6.3557329177856445, "learning_rate": 1.6334890547370286e-05, "loss": 0.387, "step": 1084 }, { "epoch": 0.8179034157832744, "grad_norm": 6.406612873077393, "learning_rate": 1.6204470846424268e-05, "loss": 0.3736, "step": 1085 }, { "epoch": 0.8186572438162544, "grad_norm": 6.225420951843262, "learning_rate": 1.607452795930131e-05, "loss": 0.3886, "step": 1086 }, { "epoch": 0.8194110718492343, "grad_norm": 6.3113789558410645, "learning_rate": 1.594506262540324e-05, "loss": 0.402, "step": 1087 }, { "epoch": 0.8201648998822144, "grad_norm": 6.504429817199707, "learning_rate": 1.5816075581414458e-05, "loss": 0.3911, "step": 1088 }, { "epoch": 0.8209187279151944, "grad_norm": 7.651139736175537, "learning_rate": 1.56875675612978e-05, "loss": 0.4127, "step": 1089 }, { "epoch": 0.8216725559481743, "grad_norm": 6.864494800567627, "learning_rate": 1.5559539296290403e-05, "loss": 0.3841, "step": 1090 }, { "epoch": 0.8224263839811543, "grad_norm": 7.120053291320801, "learning_rate": 1.5431991514899446e-05, "loss": 0.4185, "step": 1091 }, { "epoch": 0.8231802120141343, "grad_norm": 7.861664295196533, "learning_rate": 1.5304924942898068e-05, "loss": 0.4293, "step": 1092 }, { "epoch": 0.8239340400471142, "grad_norm": 8.355661392211914, "learning_rate": 1.5178340303321314e-05, "loss": 0.4559, "step": 1093 }, { "epoch": 0.8246878680800942, "grad_norm": 8.859525680541992, "learning_rate": 1.5052238316461753e-05, "loss": 0.4503, "step": 1094 }, { "epoch": 0.8254416961130742, "grad_norm": 9.211348533630371, "learning_rate": 1.492661969986574e-05, "loss": 0.4435, "step": 1095 }, { "epoch": 0.8261955241460541, "grad_norm": 8.610541343688965, "learning_rate": 1.4801485168329066e-05, "loss": 0.4625, "step": 1096 }, { "epoch": 0.8269493521790342, "grad_norm": 10.033802032470703, "learning_rate": 1.4676835433892989e-05, "loss": 0.437, "step": 1097 }, { "epoch": 0.8277031802120142, "grad_norm": 10.607207298278809, "learning_rate": 1.4552671205840163e-05, "loss": 0.4369, "step": 1098 }, { "epoch": 0.8284570082449941, "grad_norm": 10.07897663116455, "learning_rate": 1.4428993190690677e-05, "loss": 0.4563, "step": 1099 }, { "epoch": 0.8292108362779741, "grad_norm": 12.518508911132812, "learning_rate": 1.4305802092197829e-05, "loss": 0.4645, "step": 1100 }, { "epoch": 0.8299646643109541, "grad_norm": 5.578033924102783, "learning_rate": 1.4183098611344415e-05, "loss": 0.51, "step": 1101 }, { "epoch": 0.830718492343934, "grad_norm": 5.301563739776611, "learning_rate": 1.4060883446338502e-05, "loss": 0.4486, "step": 1102 }, { "epoch": 0.831472320376914, "grad_norm": 5.3994293212890625, "learning_rate": 1.393915729260955e-05, "loss": 0.4536, "step": 1103 }, { "epoch": 0.832226148409894, "grad_norm": 5.560753345489502, "learning_rate": 1.3817920842804433e-05, "loss": 0.455, "step": 1104 }, { "epoch": 0.8329799764428739, "grad_norm": 5.810977935791016, "learning_rate": 1.3697174786783584e-05, "loss": 0.4373, "step": 1105 }, { "epoch": 0.833733804475854, "grad_norm": 5.4894256591796875, "learning_rate": 1.3576919811616862e-05, "loss": 0.4106, "step": 1106 }, { "epoch": 0.834487632508834, "grad_norm": 5.865782737731934, "learning_rate": 1.345715660157989e-05, "loss": 0.4151, "step": 1107 }, { "epoch": 0.8352414605418139, "grad_norm": 5.4949469566345215, "learning_rate": 1.3337885838149988e-05, "loss": 0.4422, "step": 1108 }, { "epoch": 0.8359952885747939, "grad_norm": 5.45637845993042, "learning_rate": 1.3219108200002418e-05, "loss": 0.4237, "step": 1109 }, { "epoch": 0.8367491166077738, "grad_norm": 5.681154251098633, "learning_rate": 1.3100824363006326e-05, "loss": 0.443, "step": 1110 }, { "epoch": 0.8375029446407538, "grad_norm": 5.729828357696533, "learning_rate": 1.2983035000221177e-05, "loss": 0.4053, "step": 1111 }, { "epoch": 0.8382567726737338, "grad_norm": 6.101329326629639, "learning_rate": 1.2865740781892699e-05, "loss": 0.4384, "step": 1112 }, { "epoch": 0.8390106007067137, "grad_norm": 5.694645881652832, "learning_rate": 1.2748942375449135e-05, "loss": 0.4013, "step": 1113 }, { "epoch": 0.8397644287396937, "grad_norm": 5.564671516418457, "learning_rate": 1.263264044549748e-05, "loss": 0.4148, "step": 1114 }, { "epoch": 0.8405182567726738, "grad_norm": 5.393068313598633, "learning_rate": 1.2516835653819725e-05, "loss": 0.3981, "step": 1115 }, { "epoch": 0.8412720848056537, "grad_norm": 5.637123107910156, "learning_rate": 1.2401528659368911e-05, "loss": 0.406, "step": 1116 }, { "epoch": 0.8420259128386337, "grad_norm": 5.908216953277588, "learning_rate": 1.2286720118265659e-05, "loss": 0.3637, "step": 1117 }, { "epoch": 0.8427797408716137, "grad_norm": 5.7352070808410645, "learning_rate": 1.2172410683794177e-05, "loss": 0.4082, "step": 1118 }, { "epoch": 0.8435335689045936, "grad_norm": 5.5727858543396, "learning_rate": 1.2058601006398718e-05, "loss": 0.3828, "step": 1119 }, { "epoch": 0.8442873969375736, "grad_norm": 6.22990608215332, "learning_rate": 1.1945291733679764e-05, "loss": 0.4306, "step": 1120 }, { "epoch": 0.8450412249705536, "grad_norm": 5.981517314910889, "learning_rate": 1.1832483510390469e-05, "loss": 0.4177, "step": 1121 }, { "epoch": 0.8457950530035335, "grad_norm": 5.5717973709106445, "learning_rate": 1.1720176978432795e-05, "loss": 0.375, "step": 1122 }, { "epoch": 0.8465488810365136, "grad_norm": 5.83533239364624, "learning_rate": 1.1608372776854103e-05, "loss": 0.4141, "step": 1123 }, { "epoch": 0.8473027090694936, "grad_norm": 5.770301342010498, "learning_rate": 1.1497071541843306e-05, "loss": 0.3698, "step": 1124 }, { "epoch": 0.8480565371024735, "grad_norm": 5.999599933624268, "learning_rate": 1.1386273906727363e-05, "loss": 0.4177, "step": 1125 }, { "epoch": 0.8488103651354535, "grad_norm": 5.716385841369629, "learning_rate": 1.1275980501967642e-05, "loss": 0.3931, "step": 1126 }, { "epoch": 0.8495641931684335, "grad_norm": 6.15166711807251, "learning_rate": 1.1166191955156346e-05, "loss": 0.4025, "step": 1127 }, { "epoch": 0.8503180212014134, "grad_norm": 6.117612361907959, "learning_rate": 1.1056908891012884e-05, "loss": 0.4186, "step": 1128 }, { "epoch": 0.8510718492343934, "grad_norm": 6.109333038330078, "learning_rate": 1.0948131931380457e-05, "loss": 0.3863, "step": 1129 }, { "epoch": 0.8518256772673733, "grad_norm": 5.863979816436768, "learning_rate": 1.0839861695222354e-05, "loss": 0.3737, "step": 1130 }, { "epoch": 0.8525795053003533, "grad_norm": 5.980686664581299, "learning_rate": 1.0732098798618517e-05, "loss": 0.3739, "step": 1131 }, { "epoch": 0.8533333333333334, "grad_norm": 6.321891784667969, "learning_rate": 1.0624843854762034e-05, "loss": 0.416, "step": 1132 }, { "epoch": 0.8540871613663134, "grad_norm": 6.081487655639648, "learning_rate": 1.0518097473955624e-05, "loss": 0.3922, "step": 1133 }, { "epoch": 0.8548409893992933, "grad_norm": 6.287003040313721, "learning_rate": 1.0411860263608186e-05, "loss": 0.3747, "step": 1134 }, { "epoch": 0.8555948174322733, "grad_norm": 6.175232887268066, "learning_rate": 1.0306132828231318e-05, "loss": 0.3708, "step": 1135 }, { "epoch": 0.8563486454652532, "grad_norm": 6.49648904800415, "learning_rate": 1.0200915769435937e-05, "loss": 0.373, "step": 1136 }, { "epoch": 0.8571024734982332, "grad_norm": 6.249892234802246, "learning_rate": 1.009620968592876e-05, "loss": 0.3807, "step": 1137 }, { "epoch": 0.8578563015312132, "grad_norm": 6.616731643676758, "learning_rate": 9.992015173508995e-06, "loss": 0.3981, "step": 1138 }, { "epoch": 0.8586101295641931, "grad_norm": 6.801102638244629, "learning_rate": 9.88833282506486e-06, "loss": 0.3968, "step": 1139 }, { "epoch": 0.8593639575971731, "grad_norm": 6.820323467254639, "learning_rate": 9.785163230570282e-06, "loss": 0.3939, "step": 1140 }, { "epoch": 0.8601177856301532, "grad_norm": 8.20490837097168, "learning_rate": 9.682506977081496e-06, "loss": 0.4353, "step": 1141 }, { "epoch": 0.8608716136631331, "grad_norm": 7.587864398956299, "learning_rate": 9.580364648733775e-06, "loss": 0.4369, "step": 1142 }, { "epoch": 0.8616254416961131, "grad_norm": 7.294688701629639, "learning_rate": 9.478736826737944e-06, "loss": 0.411, "step": 1143 }, { "epoch": 0.8623792697290931, "grad_norm": 7.802835464477539, "learning_rate": 9.37762408937729e-06, "loss": 0.424, "step": 1144 }, { "epoch": 0.863133097762073, "grad_norm": 8.21778678894043, "learning_rate": 9.277027012004125e-06, "loss": 0.4752, "step": 1145 }, { "epoch": 0.863886925795053, "grad_norm": 8.805744171142578, "learning_rate": 9.176946167036516e-06, "loss": 0.4736, "step": 1146 }, { "epoch": 0.864640753828033, "grad_norm": 10.24565601348877, "learning_rate": 9.07738212395508e-06, "loss": 0.4635, "step": 1147 }, { "epoch": 0.8653945818610129, "grad_norm": 9.218001365661621, "learning_rate": 8.978335449299791e-06, "loss": 0.4313, "step": 1148 }, { "epoch": 0.8661484098939929, "grad_norm": 10.276748657226562, "learning_rate": 8.87980670666655e-06, "loss": 0.421, "step": 1149 }, { "epoch": 0.866902237926973, "grad_norm": 11.982145309448242, "learning_rate": 8.781796456704262e-06, "loss": 0.4486, "step": 1150 }, { "epoch": 0.8676560659599529, "grad_norm": 5.365624904632568, "learning_rate": 8.684305257111425e-06, "loss": 0.5014, "step": 1151 }, { "epoch": 0.8684098939929329, "grad_norm": 5.599196910858154, "learning_rate": 8.587333662633035e-06, "loss": 0.4984, "step": 1152 }, { "epoch": 0.8691637220259129, "grad_norm": 5.679477214813232, "learning_rate": 8.490882225057428e-06, "loss": 0.5011, "step": 1153 }, { "epoch": 0.8699175500588928, "grad_norm": 5.679898738861084, "learning_rate": 8.39495149321322e-06, "loss": 0.443, "step": 1154 }, { "epoch": 0.8706713780918728, "grad_norm": 5.414709091186523, "learning_rate": 8.299542012965944e-06, "loss": 0.4269, "step": 1155 }, { "epoch": 0.8714252061248527, "grad_norm": 5.3179426193237305, "learning_rate": 8.204654327215267e-06, "loss": 0.4395, "step": 1156 }, { "epoch": 0.8721790341578327, "grad_norm": 5.2444963455200195, "learning_rate": 8.110288975891634e-06, "loss": 0.4217, "step": 1157 }, { "epoch": 0.8729328621908127, "grad_norm": 5.733283996582031, "learning_rate": 8.016446495953367e-06, "loss": 0.4395, "step": 1158 }, { "epoch": 0.8736866902237928, "grad_norm": 5.545217037200928, "learning_rate": 7.923127421383458e-06, "loss": 0.436, "step": 1159 }, { "epoch": 0.8744405182567727, "grad_norm": 5.759894371032715, "learning_rate": 7.830332283186714e-06, "loss": 0.4376, "step": 1160 }, { "epoch": 0.8751943462897527, "grad_norm": 5.31406831741333, "learning_rate": 7.73806160938656e-06, "loss": 0.4097, "step": 1161 }, { "epoch": 0.8759481743227326, "grad_norm": 5.372743129730225, "learning_rate": 7.646315925022152e-06, "loss": 0.4264, "step": 1162 }, { "epoch": 0.8767020023557126, "grad_norm": 5.223913192749023, "learning_rate": 7.555095752145313e-06, "loss": 0.3879, "step": 1163 }, { "epoch": 0.8774558303886926, "grad_norm": 5.493069171905518, "learning_rate": 7.4644016098176615e-06, "loss": 0.4099, "step": 1164 }, { "epoch": 0.8782096584216725, "grad_norm": 5.413908004760742, "learning_rate": 7.374234014107484e-06, "loss": 0.4041, "step": 1165 }, { "epoch": 0.8789634864546525, "grad_norm": 5.9703288078308105, "learning_rate": 7.284593478087043e-06, "loss": 0.4391, "step": 1166 }, { "epoch": 0.8797173144876325, "grad_norm": 6.033265590667725, "learning_rate": 7.195480511829411e-06, "loss": 0.4356, "step": 1167 }, { "epoch": 0.8804711425206125, "grad_norm": 5.589619159698486, "learning_rate": 7.106895622405752e-06, "loss": 0.4029, "step": 1168 }, { "epoch": 0.8812249705535925, "grad_norm": 5.580582141876221, "learning_rate": 7.018839313882286e-06, "loss": 0.4039, "step": 1169 }, { "epoch": 0.8819787985865725, "grad_norm": 5.605942726135254, "learning_rate": 6.931312087317632e-06, "loss": 0.3915, "step": 1170 }, { "epoch": 0.8827326266195524, "grad_norm": 5.954355239868164, "learning_rate": 6.844314440759647e-06, "loss": 0.4119, "step": 1171 }, { "epoch": 0.8834864546525324, "grad_norm": 5.943442344665527, "learning_rate": 6.7578468692429345e-06, "loss": 0.4227, "step": 1172 }, { "epoch": 0.8842402826855124, "grad_norm": 6.070568561553955, "learning_rate": 6.6719098647857525e-06, "loss": 0.3824, "step": 1173 }, { "epoch": 0.8849941107184923, "grad_norm": 5.827738285064697, "learning_rate": 6.586503916387366e-06, "loss": 0.4358, "step": 1174 }, { "epoch": 0.8857479387514723, "grad_norm": 5.9503655433654785, "learning_rate": 6.501629510025231e-06, "loss": 0.3862, "step": 1175 }, { "epoch": 0.8865017667844522, "grad_norm": 5.86431360244751, "learning_rate": 6.417287128652172e-06, "loss": 0.3849, "step": 1176 }, { "epoch": 0.8872555948174323, "grad_norm": 5.833621978759766, "learning_rate": 6.333477252193731e-06, "loss": 0.3935, "step": 1177 }, { "epoch": 0.8880094228504123, "grad_norm": 6.094554901123047, "learning_rate": 6.250200357545377e-06, "loss": 0.3911, "step": 1178 }, { "epoch": 0.8887632508833923, "grad_norm": 5.814612865447998, "learning_rate": 6.167456918569792e-06, "loss": 0.3738, "step": 1179 }, { "epoch": 0.8895170789163722, "grad_norm": 6.395360946655273, "learning_rate": 6.085247406094197e-06, "loss": 0.3692, "step": 1180 }, { "epoch": 0.8902709069493522, "grad_norm": 5.914385795593262, "learning_rate": 6.003572287907633e-06, "loss": 0.4008, "step": 1181 }, { "epoch": 0.8910247349823321, "grad_norm": 6.416135787963867, "learning_rate": 5.922432028758362e-06, "loss": 0.3997, "step": 1182 }, { "epoch": 0.8917785630153121, "grad_norm": 5.680757522583008, "learning_rate": 5.841827090351171e-06, "loss": 0.347, "step": 1183 }, { "epoch": 0.8925323910482921, "grad_norm": 5.837109088897705, "learning_rate": 5.761757931344758e-06, "loss": 0.3623, "step": 1184 }, { "epoch": 0.893286219081272, "grad_norm": 5.914787769317627, "learning_rate": 5.68222500734914e-06, "loss": 0.3632, "step": 1185 }, { "epoch": 0.8940400471142521, "grad_norm": 6.179137229919434, "learning_rate": 5.603228770923041e-06, "loss": 0.3864, "step": 1186 }, { "epoch": 0.8947938751472321, "grad_norm": 5.854869365692139, "learning_rate": 5.524769671571317e-06, "loss": 0.3318, "step": 1187 }, { "epoch": 0.895547703180212, "grad_norm": 6.880571365356445, "learning_rate": 5.446848155742401e-06, "loss": 0.4063, "step": 1188 }, { "epoch": 0.896301531213192, "grad_norm": 6.602806568145752, "learning_rate": 5.3694646668257855e-06, "loss": 0.3698, "step": 1189 }, { "epoch": 0.897055359246172, "grad_norm": 7.17775821685791, "learning_rate": 5.292619645149433e-06, "loss": 0.4266, "step": 1190 }, { "epoch": 0.8978091872791519, "grad_norm": 7.022253036499023, "learning_rate": 5.2163135279773904e-06, "loss": 0.3885, "step": 1191 }, { "epoch": 0.8985630153121319, "grad_norm": 7.834957599639893, "learning_rate": 5.140546749507136e-06, "loss": 0.4484, "step": 1192 }, { "epoch": 0.8993168433451119, "grad_norm": 8.505350112915039, "learning_rate": 5.06531974086728e-06, "loss": 0.4535, "step": 1193 }, { "epoch": 0.9000706713780918, "grad_norm": 8.074254035949707, "learning_rate": 4.9906329301149914e-06, "loss": 0.4528, "step": 1194 }, { "epoch": 0.9008244994110719, "grad_norm": 8.195548057556152, "learning_rate": 4.916486742233606e-06, "loss": 0.447, "step": 1195 }, { "epoch": 0.9015783274440519, "grad_norm": 9.020340919494629, "learning_rate": 4.8428815991302005e-06, "loss": 0.4507, "step": 1196 }, { "epoch": 0.9023321554770318, "grad_norm": 9.480902671813965, "learning_rate": 4.769817919633235e-06, "loss": 0.4905, "step": 1197 }, { "epoch": 0.9030859835100118, "grad_norm": 9.953953742980957, "learning_rate": 4.697296119490047e-06, "loss": 0.4291, "step": 1198 }, { "epoch": 0.9038398115429918, "grad_norm": 9.974310874938965, "learning_rate": 4.625316611364661e-06, "loss": 0.4283, "step": 1199 }, { "epoch": 0.9045936395759717, "grad_norm": 12.497854232788086, "learning_rate": 4.553879804835282e-06, "loss": 0.4614, "step": 1200 }, { "epoch": 0.9053474676089517, "grad_norm": 4.8798136711120605, "learning_rate": 4.482986106392073e-06, "loss": 0.4771, "step": 1201 }, { "epoch": 0.9061012956419316, "grad_norm": 4.956184387207031, "learning_rate": 4.412635919434749e-06, "loss": 0.4444, "step": 1202 }, { "epoch": 0.9068551236749116, "grad_norm": 5.346173286437988, "learning_rate": 4.342829644270429e-06, "loss": 0.4442, "step": 1203 }, { "epoch": 0.9076089517078917, "grad_norm": 5.293701648712158, "learning_rate": 4.273567678111123e-06, "loss": 0.4614, "step": 1204 }, { "epoch": 0.9083627797408717, "grad_norm": 5.237243175506592, "learning_rate": 4.204850415071748e-06, "loss": 0.4512, "step": 1205 }, { "epoch": 0.9091166077738516, "grad_norm": 5.3798604011535645, "learning_rate": 4.136678246167636e-06, "loss": 0.4286, "step": 1206 }, { "epoch": 0.9098704358068316, "grad_norm": 5.367835998535156, "learning_rate": 4.069051559312531e-06, "loss": 0.4139, "step": 1207 }, { "epoch": 0.9106242638398115, "grad_norm": 5.50463342666626, "learning_rate": 4.001970739316163e-06, "loss": 0.4407, "step": 1208 }, { "epoch": 0.9113780918727915, "grad_norm": 5.295793056488037, "learning_rate": 3.935436167882234e-06, "loss": 0.418, "step": 1209 }, { "epoch": 0.9121319199057715, "grad_norm": 5.284564018249512, "learning_rate": 3.869448223606165e-06, "loss": 0.4096, "step": 1210 }, { "epoch": 0.9128857479387514, "grad_norm": 5.553956031799316, "learning_rate": 3.8040072819729545e-06, "loss": 0.4141, "step": 1211 }, { "epoch": 0.9136395759717314, "grad_norm": 5.626007080078125, "learning_rate": 3.7391137153550137e-06, "loss": 0.4138, "step": 1212 }, { "epoch": 0.9143934040047115, "grad_norm": 5.603013038635254, "learning_rate": 3.6747678930101558e-06, "loss": 0.4148, "step": 1213 }, { "epoch": 0.9151472320376914, "grad_norm": 5.539734363555908, "learning_rate": 3.6109701810793208e-06, "loss": 0.4181, "step": 1214 }, { "epoch": 0.9159010600706714, "grad_norm": 5.379584789276123, "learning_rate": 3.5477209425846538e-06, "loss": 0.4015, "step": 1215 }, { "epoch": 0.9166548881036514, "grad_norm": 5.433023929595947, "learning_rate": 3.4850205374273416e-06, "loss": 0.398, "step": 1216 }, { "epoch": 0.9174087161366313, "grad_norm": 5.5849199295043945, "learning_rate": 3.4228693223856136e-06, "loss": 0.4165, "step": 1217 }, { "epoch": 0.9181625441696113, "grad_norm": 5.703511714935303, "learning_rate": 3.361267651112676e-06, "loss": 0.422, "step": 1218 }, { "epoch": 0.9189163722025913, "grad_norm": 5.733764171600342, "learning_rate": 3.30021587413476e-06, "loss": 0.4017, "step": 1219 }, { "epoch": 0.9196702002355712, "grad_norm": 5.802048206329346, "learning_rate": 3.2397143388489983e-06, "loss": 0.3935, "step": 1220 }, { "epoch": 0.9204240282685512, "grad_norm": 5.458968639373779, "learning_rate": 3.1797633895216394e-06, "loss": 0.3783, "step": 1221 }, { "epoch": 0.9211778563015313, "grad_norm": 5.353023052215576, "learning_rate": 3.120363367285917e-06, "loss": 0.3788, "step": 1222 }, { "epoch": 0.9219316843345112, "grad_norm": 5.518474578857422, "learning_rate": 3.0615146101401925e-06, "loss": 0.3944, "step": 1223 }, { "epoch": 0.9226855123674912, "grad_norm": 5.713134765625, "learning_rate": 3.0032174529460165e-06, "loss": 0.3953, "step": 1224 }, { "epoch": 0.9234393404004712, "grad_norm": 6.142655372619629, "learning_rate": 2.945472227426227e-06, "loss": 0.4168, "step": 1225 }, { "epoch": 0.9241931684334511, "grad_norm": 5.580604553222656, "learning_rate": 2.8882792621630406e-06, "loss": 0.3642, "step": 1226 }, { "epoch": 0.9249469964664311, "grad_norm": 5.7619757652282715, "learning_rate": 2.8316388825962324e-06, "loss": 0.3708, "step": 1227 }, { "epoch": 0.925700824499411, "grad_norm": 6.232563018798828, "learning_rate": 2.7755514110212264e-06, "loss": 0.4063, "step": 1228 }, { "epoch": 0.926454652532391, "grad_norm": 5.895346164703369, "learning_rate": 2.7200171665872742e-06, "loss": 0.399, "step": 1229 }, { "epoch": 0.927208480565371, "grad_norm": 5.760490894317627, "learning_rate": 2.6650364652956894e-06, "loss": 0.3785, "step": 1230 }, { "epoch": 0.927962308598351, "grad_norm": 5.620173454284668, "learning_rate": 2.6106096199979614e-06, "loss": 0.3564, "step": 1231 }, { "epoch": 0.928716136631331, "grad_norm": 5.84246826171875, "learning_rate": 2.5567369403940776e-06, "loss": 0.3575, "step": 1232 }, { "epoch": 0.929469964664311, "grad_norm": 5.908325672149658, "learning_rate": 2.50341873303066e-06, "loss": 0.384, "step": 1233 }, { "epoch": 0.9302237926972909, "grad_norm": 5.850981712341309, "learning_rate": 2.4506553012993093e-06, "loss": 0.3704, "step": 1234 }, { "epoch": 0.9309776207302709, "grad_norm": 6.301943778991699, "learning_rate": 2.398446945434818e-06, "loss": 0.385, "step": 1235 }, { "epoch": 0.9317314487632509, "grad_norm": 6.557477951049805, "learning_rate": 2.346793962513483e-06, "loss": 0.3607, "step": 1236 }, { "epoch": 0.9324852767962308, "grad_norm": 6.442347049713135, "learning_rate": 2.2956966464514175e-06, "loss": 0.3829, "step": 1237 }, { "epoch": 0.9332391048292108, "grad_norm": 7.224841594696045, "learning_rate": 2.245155288002876e-06, "loss": 0.3964, "step": 1238 }, { "epoch": 0.9339929328621908, "grad_norm": 7.129518032073975, "learning_rate": 2.1951701747585982e-06, "loss": 0.3682, "step": 1239 }, { "epoch": 0.9347467608951708, "grad_norm": 6.685035228729248, "learning_rate": 2.1457415911442013e-06, "loss": 0.4049, "step": 1240 }, { "epoch": 0.9355005889281508, "grad_norm": 7.421708583831787, "learning_rate": 2.0968698184184565e-06, "loss": 0.4029, "step": 1241 }, { "epoch": 0.9362544169611308, "grad_norm": 7.260560989379883, "learning_rate": 2.04855513467187e-06, "loss": 0.4232, "step": 1242 }, { "epoch": 0.9370082449941107, "grad_norm": 8.069437980651855, "learning_rate": 2.000797814824906e-06, "loss": 0.4409, "step": 1243 }, { "epoch": 0.9377620730270907, "grad_norm": 7.945827960968018, "learning_rate": 1.9535981306265884e-06, "loss": 0.4244, "step": 1244 }, { "epoch": 0.9385159010600707, "grad_norm": 8.818882942199707, "learning_rate": 1.9069563506527998e-06, "loss": 0.4722, "step": 1245 }, { "epoch": 0.9392697290930506, "grad_norm": 8.6805419921875, "learning_rate": 1.8608727403049309e-06, "loss": 0.4574, "step": 1246 }, { "epoch": 0.9400235571260306, "grad_norm": 8.550375938415527, "learning_rate": 1.8153475618081673e-06, "loss": 0.4289, "step": 1247 }, { "epoch": 0.9407773851590105, "grad_norm": 9.816337585449219, "learning_rate": 1.7703810742101813e-06, "loss": 0.4884, "step": 1248 }, { "epoch": 0.9415312131919906, "grad_norm": 9.228532791137695, "learning_rate": 1.7259735333795545e-06, "loss": 0.4282, "step": 1249 }, { "epoch": 0.9422850412249706, "grad_norm": 12.300414085388184, "learning_rate": 1.6821251920043246e-06, "loss": 0.4527, "step": 1250 }, { "epoch": 0.9430388692579506, "grad_norm": 5.250865459442139, "learning_rate": 1.6388362995905848e-06, "loss": 0.509, "step": 1251 }, { "epoch": 0.9437926972909305, "grad_norm": 5.213113307952881, "learning_rate": 1.5961071024610752e-06, "loss": 0.4615, "step": 1252 }, { "epoch": 0.9445465253239105, "grad_norm": 5.200348377227783, "learning_rate": 1.5539378437536944e-06, "loss": 0.4463, "step": 1253 }, { "epoch": 0.9453003533568904, "grad_norm": 5.2860941886901855, "learning_rate": 1.5123287634202454e-06, "loss": 0.4441, "step": 1254 }, { "epoch": 0.9460541813898704, "grad_norm": 5.183274269104004, "learning_rate": 1.4712800982249474e-06, "loss": 0.4292, "step": 1255 }, { "epoch": 0.9468080094228504, "grad_norm": 5.593634605407715, "learning_rate": 1.430792081743182e-06, "loss": 0.4589, "step": 1256 }, { "epoch": 0.9475618374558303, "grad_norm": 5.3267388343811035, "learning_rate": 1.3908649443600707e-06, "loss": 0.4336, "step": 1257 }, { "epoch": 0.9483156654888104, "grad_norm": 5.741166114807129, "learning_rate": 1.351498913269289e-06, "loss": 0.4008, "step": 1258 }, { "epoch": 0.9490694935217904, "grad_norm": 5.336604118347168, "learning_rate": 1.3126942124716213e-06, "loss": 0.4218, "step": 1259 }, { "epoch": 0.9498233215547703, "grad_norm": 5.611804962158203, "learning_rate": 1.2744510627738516e-06, "loss": 0.4434, "step": 1260 }, { "epoch": 0.9505771495877503, "grad_norm": 5.724870204925537, "learning_rate": 1.2367696817873419e-06, "loss": 0.4227, "step": 1261 }, { "epoch": 0.9513309776207303, "grad_norm": 5.307777404785156, "learning_rate": 1.1996502839269453e-06, "loss": 0.4002, "step": 1262 }, { "epoch": 0.9520848056537102, "grad_norm": 5.79971170425415, "learning_rate": 1.1630930804096495e-06, "loss": 0.405, "step": 1263 }, { "epoch": 0.9528386336866902, "grad_norm": 5.324243068695068, "learning_rate": 1.127098279253491e-06, "loss": 0.4043, "step": 1264 }, { "epoch": 0.9535924617196702, "grad_norm": 5.532378673553467, "learning_rate": 1.0916660852763216e-06, "loss": 0.4068, "step": 1265 }, { "epoch": 0.9543462897526501, "grad_norm": 5.695662021636963, "learning_rate": 1.0567967000945866e-06, "loss": 0.4286, "step": 1266 }, { "epoch": 0.9551001177856302, "grad_norm": 5.8561482429504395, "learning_rate": 1.0224903221222938e-06, "loss": 0.4249, "step": 1267 }, { "epoch": 0.9558539458186102, "grad_norm": 5.72511625289917, "learning_rate": 9.88747146569813e-07, "loss": 0.4021, "step": 1268 }, { "epoch": 0.9566077738515901, "grad_norm": 5.385478973388672, "learning_rate": 9.555673654427332e-07, "loss": 0.3788, "step": 1269 }, { "epoch": 0.9573616018845701, "grad_norm": 5.669264316558838, "learning_rate": 9.229511675408642e-07, "loss": 0.4148, "step": 1270 }, { "epoch": 0.95811542991755, "grad_norm": 5.313277244567871, "learning_rate": 8.90898738457091e-07, "loss": 0.3641, "step": 1271 }, { "epoch": 0.95886925795053, "grad_norm": 5.480482578277588, "learning_rate": 8.59410260576321e-07, "loss": 0.3971, "step": 1272 }, { "epoch": 0.95962308598351, "grad_norm": 5.8209757804870605, "learning_rate": 8.28485913074506e-07, "loss": 0.3919, "step": 1273 }, { "epoch": 0.96037691401649, "grad_norm": 5.919877052307129, "learning_rate": 7.981258719175322e-07, "loss": 0.3863, "step": 1274 }, { "epoch": 0.9611307420494699, "grad_norm": 5.9404144287109375, "learning_rate": 7.683303098602989e-07, "loss": 0.4059, "step": 1275 }, { "epoch": 0.96188457008245, "grad_norm": 5.609850883483887, "learning_rate": 7.39099396445686e-07, "loss": 0.3697, "step": 1276 }, { "epoch": 0.96263839811543, "grad_norm": 5.695891857147217, "learning_rate": 7.104332980036211e-07, "loss": 0.3917, "step": 1277 }, { "epoch": 0.9633922261484099, "grad_norm": 5.932850360870361, "learning_rate": 6.823321776501024e-07, "loss": 0.415, "step": 1278 }, { "epoch": 0.9641460541813899, "grad_norm": 6.023778438568115, "learning_rate": 6.547961952863002e-07, "loss": 0.3817, "step": 1279 }, { "epoch": 0.9648998822143698, "grad_norm": 5.926705360412598, "learning_rate": 6.278255075976125e-07, "loss": 0.3884, "step": 1280 }, { "epoch": 0.9656537102473498, "grad_norm": 5.837738513946533, "learning_rate": 6.014202680528324e-07, "loss": 0.3598, "step": 1281 }, { "epoch": 0.9664075382803298, "grad_norm": 6.178413391113281, "learning_rate": 5.755806269031827e-07, "loss": 0.3917, "step": 1282 }, { "epoch": 0.9671613663133097, "grad_norm": 6.282332897186279, "learning_rate": 5.503067311815713e-07, "loss": 0.4286, "step": 1283 }, { "epoch": 0.9679151943462897, "grad_norm": 6.746578216552734, "learning_rate": 5.255987247016591e-07, "loss": 0.4118, "step": 1284 }, { "epoch": 0.9686690223792698, "grad_norm": 6.075422763824463, "learning_rate": 5.014567480570831e-07, "loss": 0.3829, "step": 1285 }, { "epoch": 0.9694228504122497, "grad_norm": 6.149974346160889, "learning_rate": 4.778809386206895e-07, "loss": 0.3847, "step": 1286 }, { "epoch": 0.9701766784452297, "grad_norm": 6.333911418914795, "learning_rate": 4.548714305436685e-07, "loss": 0.3638, "step": 1287 }, { "epoch": 0.9709305064782097, "grad_norm": 6.391441345214844, "learning_rate": 4.324283547548658e-07, "loss": 0.3893, "step": 1288 }, { "epoch": 0.9716843345111896, "grad_norm": 6.624934196472168, "learning_rate": 4.1055183896001606e-07, "loss": 0.378, "step": 1289 }, { "epoch": 0.9724381625441696, "grad_norm": 6.473977565765381, "learning_rate": 3.892420076409886e-07, "loss": 0.366, "step": 1290 }, { "epoch": 0.9731919905771496, "grad_norm": 6.985432147979736, "learning_rate": 3.68498982055121e-07, "loss": 0.4335, "step": 1291 }, { "epoch": 0.9739458186101295, "grad_norm": 7.089210510253906, "learning_rate": 3.483228802344973e-07, "loss": 0.4066, "step": 1292 }, { "epoch": 0.9746996466431095, "grad_norm": 7.46934175491333, "learning_rate": 3.2871381698529324e-07, "loss": 0.4253, "step": 1293 }, { "epoch": 0.9754534746760896, "grad_norm": 8.461312294006348, "learning_rate": 3.0967190388712097e-07, "loss": 0.4596, "step": 1294 }, { "epoch": 0.9762073027090695, "grad_norm": 8.289325714111328, "learning_rate": 2.9119724929239645e-07, "loss": 0.4382, "step": 1295 }, { "epoch": 0.9769611307420495, "grad_norm": 8.890064239501953, "learning_rate": 2.7328995832568426e-07, "loss": 0.4469, "step": 1296 }, { "epoch": 0.9777149587750295, "grad_norm": 8.737083435058594, "learning_rate": 2.5595013288318703e-07, "loss": 0.4262, "step": 1297 }, { "epoch": 0.9784687868080094, "grad_norm": 9.281461715698242, "learning_rate": 2.391778716320792e-07, "loss": 0.4036, "step": 1298 }, { "epoch": 0.9792226148409894, "grad_norm": 9.91952896118164, "learning_rate": 2.2297327000996293e-07, "loss": 0.4469, "step": 1299 }, { "epoch": 0.9799764428739693, "grad_norm": 11.952555656433105, "learning_rate": 2.0733642022437994e-07, "loss": 0.4597, "step": 1300 }, { "epoch": 0.9807302709069493, "grad_norm": 5.1298322677612305, "learning_rate": 1.922674112522227e-07, "loss": 0.478, "step": 1301 }, { "epoch": 0.9814840989399294, "grad_norm": 5.572525501251221, "learning_rate": 1.7776632883924615e-07, "loss": 0.4829, "step": 1302 }, { "epoch": 0.9822379269729093, "grad_norm": 5.343718528747559, "learning_rate": 1.638332554996125e-07, "loss": 0.4319, "step": 1303 }, { "epoch": 0.9829917550058893, "grad_norm": 5.716027736663818, "learning_rate": 1.5046827051536928e-07, "loss": 0.4378, "step": 1304 }, { "epoch": 0.9837455830388693, "grad_norm": 5.513693809509277, "learning_rate": 1.3767144993602766e-07, "loss": 0.4235, "step": 1305 }, { "epoch": 0.9844994110718492, "grad_norm": 5.508944988250732, "learning_rate": 1.254428665781515e-07, "loss": 0.4007, "step": 1306 }, { "epoch": 0.9852532391048292, "grad_norm": 5.180131435394287, "learning_rate": 1.1378259002488013e-07, "loss": 0.3939, "step": 1307 }, { "epoch": 0.9860070671378092, "grad_norm": 5.590184688568115, "learning_rate": 1.0269068662560611e-07, "loss": 0.4166, "step": 1308 }, { "epoch": 0.9867608951707891, "grad_norm": 5.44436502456665, "learning_rate": 9.216721949553142e-08, "loss": 0.4047, "step": 1309 }, { "epoch": 0.9875147232037691, "grad_norm": 5.489165782928467, "learning_rate": 8.221224851535647e-08, "loss": 0.3999, "step": 1310 }, { "epoch": 0.9882685512367492, "grad_norm": 5.663797855377197, "learning_rate": 7.282583033091372e-08, "loss": 0.3842, "step": 1311 }, { "epoch": 0.9890223792697291, "grad_norm": 5.638896942138672, "learning_rate": 6.400801835286796e-08, "loss": 0.3977, "step": 1312 }, { "epoch": 0.9897762073027091, "grad_norm": 5.8632307052612305, "learning_rate": 5.57588627563721e-08, "loss": 0.3579, "step": 1313 }, { "epoch": 0.9905300353356891, "grad_norm": 5.826532363891602, "learning_rate": 4.807841048082296e-08, "loss": 0.4088, "step": 1314 }, { "epoch": 0.991283863368669, "grad_norm": 5.712516784667969, "learning_rate": 4.096670522959478e-08, "loss": 0.3853, "step": 1315 }, { "epoch": 0.992037691401649, "grad_norm": 6.0777459144592285, "learning_rate": 3.442378746972841e-08, "loss": 0.4111, "step": 1316 }, { "epoch": 0.992791519434629, "grad_norm": 5.916062831878662, "learning_rate": 2.844969443178691e-08, "loss": 0.3821, "step": 1317 }, { "epoch": 0.9935453474676089, "grad_norm": 5.911341190338135, "learning_rate": 2.304446010958916e-08, "loss": 0.38, "step": 1318 }, { "epoch": 0.9942991755005889, "grad_norm": 6.334498405456543, "learning_rate": 1.8208115260032187e-08, "loss": 0.3812, "step": 1319 }, { "epoch": 0.995053003533569, "grad_norm": 6.576707363128662, "learning_rate": 1.3940687402924646e-08, "loss": 0.3858, "step": 1320 }, { "epoch": 0.9958068315665489, "grad_norm": 6.39242696762085, "learning_rate": 1.0242200820786974e-08, "loss": 0.3661, "step": 1321 }, { "epoch": 0.9965606595995289, "grad_norm": 7.869157791137695, "learning_rate": 7.112676558784781e-09, "loss": 0.3966, "step": 1322 }, { "epoch": 0.9973144876325089, "grad_norm": 7.689291954040527, "learning_rate": 4.552132424562317e-09, "loss": 0.4297, "step": 1323 }, { "epoch": 0.9980683156654888, "grad_norm": 8.572519302368164, "learning_rate": 2.5605829881203414e-09, "loss": 0.451, "step": 1324 }, { "epoch": 0.9988221436984688, "grad_norm": 9.072525024414062, "learning_rate": 1.1380395818050282e-09, "loss": 0.4373, "step": 1325 }, { "epoch": 0.9995759717314487, "grad_norm": 9.224164962768555, "learning_rate": 2.8451030018583623e-10, "loss": 0.4368, "step": 1326 }, { "epoch": 1.0007067137809187, "grad_norm": 5.4062819480896, "learning_rate": 0.0, "loss": 0.4839, "step": 1327 } ], "logging_steps": 1, "max_steps": 1327, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 332, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8606342447625667e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }