{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994239157271007, "eval_steps": 380, "global_step": 1518, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006583820261706855, "grad_norm": 0.26480913162231445, "learning_rate": 4.000000000000001e-06, "loss": 1.1899, "step": 1 }, { "epoch": 0.0006583820261706855, "eval_loss": 1.8996797800064087, "eval_runtime": 142.7165, "eval_samples_per_second": 17.924, "eval_steps_per_second": 4.484, "step": 1 }, { "epoch": 0.001316764052341371, "grad_norm": 0.3458176255226135, "learning_rate": 8.000000000000001e-06, "loss": 1.3847, "step": 2 }, { "epoch": 0.0019751460785120568, "grad_norm": 0.3616096079349518, "learning_rate": 1.2e-05, "loss": 1.3734, "step": 3 }, { "epoch": 0.002633528104682742, "grad_norm": 0.4223955571651459, "learning_rate": 1.6000000000000003e-05, "loss": 1.4578, "step": 4 }, { "epoch": 0.003291910130853428, "grad_norm": 0.4208202362060547, "learning_rate": 2e-05, "loss": 1.4779, "step": 5 }, { "epoch": 0.0039502921570241135, "grad_norm": 0.4851401448249817, "learning_rate": 2.4e-05, "loss": 1.4297, "step": 6 }, { "epoch": 0.004608674183194799, "grad_norm": 0.4874451160430908, "learning_rate": 2.8000000000000003e-05, "loss": 1.4767, "step": 7 }, { "epoch": 0.005267056209365484, "grad_norm": 0.5162709951400757, "learning_rate": 3.2000000000000005e-05, "loss": 1.4982, "step": 8 }, { "epoch": 0.0059254382355361694, "grad_norm": 0.5358583331108093, "learning_rate": 3.6e-05, "loss": 1.5077, "step": 9 }, { "epoch": 0.006583820261706856, "grad_norm": 0.5848316550254822, "learning_rate": 4e-05, "loss": 1.4564, "step": 10 }, { "epoch": 0.007242202287877541, "grad_norm": 0.649307131767273, "learning_rate": 4.4000000000000006e-05, "loss": 1.6126, "step": 11 }, { "epoch": 0.007900584314048227, "grad_norm": 0.721455991268158, "learning_rate": 4.8e-05, "loss": 1.4816, "step": 12 }, { "epoch": 0.008558966340218912, "grad_norm": 0.7559311389923096, "learning_rate": 5.2000000000000004e-05, "loss": 1.5509, "step": 13 }, { "epoch": 0.009217348366389598, "grad_norm": 0.7546976208686829, "learning_rate": 5.6000000000000006e-05, "loss": 1.5165, "step": 14 }, { "epoch": 0.009875730392560284, "grad_norm": 1.0660227537155151, "learning_rate": 6e-05, "loss": 1.6088, "step": 15 }, { "epoch": 0.010534112418730968, "grad_norm": 1.2203493118286133, "learning_rate": 6.400000000000001e-05, "loss": 1.7437, "step": 16 }, { "epoch": 0.011192494444901654, "grad_norm": 1.5237772464752197, "learning_rate": 6.800000000000001e-05, "loss": 1.7795, "step": 17 }, { "epoch": 0.011850876471072339, "grad_norm": 1.6246570348739624, "learning_rate": 7.2e-05, "loss": 1.9059, "step": 18 }, { "epoch": 0.012509258497243025, "grad_norm": 2.11099910736084, "learning_rate": 7.6e-05, "loss": 1.8841, "step": 19 }, { "epoch": 0.013167640523413711, "grad_norm": 2.276372194290161, "learning_rate": 8e-05, "loss": 1.8559, "step": 20 }, { "epoch": 0.013826022549584396, "grad_norm": 2.6171741485595703, "learning_rate": 8.4e-05, "loss": 1.8732, "step": 21 }, { "epoch": 0.014484404575755082, "grad_norm": 3.0389842987060547, "learning_rate": 8.800000000000001e-05, "loss": 2.2173, "step": 22 }, { "epoch": 0.015142786601925768, "grad_norm": 2.9564194679260254, "learning_rate": 9.200000000000001e-05, "loss": 2.3194, "step": 23 }, { "epoch": 0.015801168628096454, "grad_norm": 4.141776084899902, "learning_rate": 9.6e-05, "loss": 1.6703, "step": 24 }, { "epoch": 0.016459550654267137, "grad_norm": 4.746026039123535, "learning_rate": 0.0001, "loss": 2.4348, "step": 25 }, { "epoch": 0.017117932680437823, "grad_norm": 1.7773373126983643, "learning_rate": 0.00010400000000000001, "loss": 1.4459, "step": 26 }, { "epoch": 0.01777631470660851, "grad_norm": 2.0467586517333984, "learning_rate": 0.00010800000000000001, "loss": 1.5192, "step": 27 }, { "epoch": 0.018434696732779195, "grad_norm": 1.8465300798416138, "learning_rate": 0.00011200000000000001, "loss": 1.4189, "step": 28 }, { "epoch": 0.01909307875894988, "grad_norm": 1.5467618703842163, "learning_rate": 0.000116, "loss": 1.4587, "step": 29 }, { "epoch": 0.019751460785120568, "grad_norm": 1.040298342704773, "learning_rate": 0.00012, "loss": 1.3479, "step": 30 }, { "epoch": 0.02040984281129125, "grad_norm": 0.87656170129776, "learning_rate": 0.000124, "loss": 1.3608, "step": 31 }, { "epoch": 0.021068224837461937, "grad_norm": 0.6608215570449829, "learning_rate": 0.00012800000000000002, "loss": 1.2555, "step": 32 }, { "epoch": 0.021726606863632623, "grad_norm": 0.5476593375205994, "learning_rate": 0.000132, "loss": 1.4716, "step": 33 }, { "epoch": 0.02238498888980331, "grad_norm": 0.5305841565132141, "learning_rate": 0.00013600000000000003, "loss": 1.4462, "step": 34 }, { "epoch": 0.023043370915973995, "grad_norm": 0.5917577743530273, "learning_rate": 0.00014, "loss": 1.3469, "step": 35 }, { "epoch": 0.023701752942144678, "grad_norm": 0.6834359765052795, "learning_rate": 0.000144, "loss": 1.4932, "step": 36 }, { "epoch": 0.024360134968315364, "grad_norm": 0.6160683631896973, "learning_rate": 0.000148, "loss": 1.5475, "step": 37 }, { "epoch": 0.02501851699448605, "grad_norm": 0.6902714967727661, "learning_rate": 0.000152, "loss": 1.4997, "step": 38 }, { "epoch": 0.025676899020656736, "grad_norm": 0.7107637524604797, "learning_rate": 0.00015600000000000002, "loss": 1.4512, "step": 39 }, { "epoch": 0.026335281046827422, "grad_norm": 0.7167961001396179, "learning_rate": 0.00016, "loss": 1.3609, "step": 40 }, { "epoch": 0.02699366307299811, "grad_norm": 0.8186436891555786, "learning_rate": 0.000164, "loss": 1.4147, "step": 41 }, { "epoch": 0.02765204509916879, "grad_norm": 1.06589937210083, "learning_rate": 0.000168, "loss": 1.556, "step": 42 }, { "epoch": 0.028310427125339477, "grad_norm": 1.3348461389541626, "learning_rate": 0.000172, "loss": 1.4569, "step": 43 }, { "epoch": 0.028968809151510164, "grad_norm": 1.4606307744979858, "learning_rate": 0.00017600000000000002, "loss": 1.5029, "step": 44 }, { "epoch": 0.02962719117768085, "grad_norm": 1.5066500902175903, "learning_rate": 0.00018, "loss": 1.4565, "step": 45 }, { "epoch": 0.030285573203851536, "grad_norm": 1.8037623167037964, "learning_rate": 0.00018400000000000003, "loss": 1.3512, "step": 46 }, { "epoch": 0.03094395523002222, "grad_norm": 2.024911403656006, "learning_rate": 0.000188, "loss": 1.4498, "step": 47 }, { "epoch": 0.03160233725619291, "grad_norm": 2.5773561000823975, "learning_rate": 0.000192, "loss": 1.4423, "step": 48 }, { "epoch": 0.03226071928236359, "grad_norm": 2.2219419479370117, "learning_rate": 0.000196, "loss": 0.9938, "step": 49 }, { "epoch": 0.032919101308534274, "grad_norm": 2.548572301864624, "learning_rate": 0.0002, "loss": 1.1529, "step": 50 }, { "epoch": 0.03357748333470496, "grad_norm": 1.5913848876953125, "learning_rate": 0.0001999997710094961, "loss": 1.3615, "step": 51 }, { "epoch": 0.034235865360875646, "grad_norm": 1.4608334302902222, "learning_rate": 0.00019999908403903308, "loss": 1.387, "step": 52 }, { "epoch": 0.034894247387046336, "grad_norm": 1.1794966459274292, "learning_rate": 0.00019999793909175715, "loss": 1.4549, "step": 53 }, { "epoch": 0.03555262941321702, "grad_norm": 0.7661542892456055, "learning_rate": 0.00019999633617291194, "loss": 1.3391, "step": 54 }, { "epoch": 0.03621101143938771, "grad_norm": 0.5617344975471497, "learning_rate": 0.00019999427528983853, "loss": 1.2978, "step": 55 }, { "epoch": 0.03686939346555839, "grad_norm": 0.504490077495575, "learning_rate": 0.00019999175645197536, "loss": 1.2564, "step": 56 }, { "epoch": 0.03752777549172907, "grad_norm": 0.46205273270606995, "learning_rate": 0.00019998877967085824, "loss": 1.3083, "step": 57 }, { "epoch": 0.03818615751789976, "grad_norm": 0.4746592342853546, "learning_rate": 0.00019998534496012027, "loss": 1.3878, "step": 58 }, { "epoch": 0.038844539544070446, "grad_norm": 0.48164641857147217, "learning_rate": 0.00019998145233549174, "loss": 1.3217, "step": 59 }, { "epoch": 0.039502921570241135, "grad_norm": 0.5783324837684631, "learning_rate": 0.0001999771018148002, "loss": 1.4754, "step": 60 }, { "epoch": 0.04016130359641182, "grad_norm": 0.560674250125885, "learning_rate": 0.0001999722934179701, "loss": 1.4767, "step": 61 }, { "epoch": 0.0408196856225825, "grad_norm": 0.6083958148956299, "learning_rate": 0.00019996702716702308, "loss": 1.4167, "step": 62 }, { "epoch": 0.04147806764875319, "grad_norm": 0.6148172616958618, "learning_rate": 0.00019996130308607755, "loss": 1.3947, "step": 63 }, { "epoch": 0.04213644967492387, "grad_norm": 0.6950907707214355, "learning_rate": 0.00019995512120134866, "loss": 1.4744, "step": 64 }, { "epoch": 0.04279483170109456, "grad_norm": 0.6931608319282532, "learning_rate": 0.00019994848154114832, "loss": 1.3243, "step": 65 }, { "epoch": 0.043453213727265246, "grad_norm": 0.790777862071991, "learning_rate": 0.00019994138413588491, "loss": 1.5057, "step": 66 }, { "epoch": 0.04411159575343593, "grad_norm": 0.9626539349555969, "learning_rate": 0.0001999338290180632, "loss": 1.4716, "step": 67 }, { "epoch": 0.04476997777960662, "grad_norm": 1.1360085010528564, "learning_rate": 0.00019992581622228417, "loss": 1.3146, "step": 68 }, { "epoch": 0.0454283598057773, "grad_norm": 1.4142122268676758, "learning_rate": 0.00019991734578524487, "loss": 1.1835, "step": 69 }, { "epoch": 0.04608674183194799, "grad_norm": 1.3170647621154785, "learning_rate": 0.00019990841774573842, "loss": 1.3143, "step": 70 }, { "epoch": 0.04674512385811867, "grad_norm": 1.5642683506011963, "learning_rate": 0.00019989903214465339, "loss": 1.3561, "step": 71 }, { "epoch": 0.047403505884289356, "grad_norm": 1.6542617082595825, "learning_rate": 0.00019988918902497416, "loss": 1.3545, "step": 72 }, { "epoch": 0.048061887910460045, "grad_norm": 2.163010358810425, "learning_rate": 0.00019987888843178033, "loss": 1.1143, "step": 73 }, { "epoch": 0.04872026993663073, "grad_norm": 2.4503602981567383, "learning_rate": 0.0001998681304122466, "loss": 1.3482, "step": 74 }, { "epoch": 0.04937865196280142, "grad_norm": 3.2356019020080566, "learning_rate": 0.00019985691501564273, "loss": 1.3511, "step": 75 }, { "epoch": 0.0500370339889721, "grad_norm": 0.8084772229194641, "learning_rate": 0.00019984524229333306, "loss": 1.1252, "step": 76 }, { "epoch": 0.05069541601514279, "grad_norm": 0.8939741849899292, "learning_rate": 0.0001998331122987765, "loss": 1.3698, "step": 77 }, { "epoch": 0.05135379804131347, "grad_norm": 0.7804089188575745, "learning_rate": 0.00019982052508752603, "loss": 1.3625, "step": 78 }, { "epoch": 0.052012180067484155, "grad_norm": 0.5285259485244751, "learning_rate": 0.00019980748071722873, "loss": 1.2357, "step": 79 }, { "epoch": 0.052670562093654845, "grad_norm": 0.4449038505554199, "learning_rate": 0.00019979397924762535, "loss": 1.241, "step": 80 }, { "epoch": 0.05332894411982553, "grad_norm": 0.46149927377700806, "learning_rate": 0.00019978002074055008, "loss": 1.3408, "step": 81 }, { "epoch": 0.05398732614599622, "grad_norm": 0.44500094652175903, "learning_rate": 0.00019976560525993014, "loss": 1.2666, "step": 82 }, { "epoch": 0.0546457081721669, "grad_norm": 0.44872742891311646, "learning_rate": 0.0001997507328717858, "loss": 1.3706, "step": 83 }, { "epoch": 0.05530409019833758, "grad_norm": 0.44312259554862976, "learning_rate": 0.00019973540364422973, "loss": 1.2403, "step": 84 }, { "epoch": 0.05596247222450827, "grad_norm": 0.5274776816368103, "learning_rate": 0.0001997196176474669, "loss": 1.4539, "step": 85 }, { "epoch": 0.056620854250678955, "grad_norm": 0.5395448803901672, "learning_rate": 0.0001997033749537941, "loss": 1.3792, "step": 86 }, { "epoch": 0.057279236276849645, "grad_norm": 0.5877296924591064, "learning_rate": 0.0001996866756375999, "loss": 1.479, "step": 87 }, { "epoch": 0.05793761830302033, "grad_norm": 0.6244651675224304, "learning_rate": 0.00019966951977536387, "loss": 1.356, "step": 88 }, { "epoch": 0.05859600032919101, "grad_norm": 0.6435254216194153, "learning_rate": 0.00019965190744565675, "loss": 1.2463, "step": 89 }, { "epoch": 0.0592543823553617, "grad_norm": 0.8112192749977112, "learning_rate": 0.0001996338387291395, "loss": 1.4228, "step": 90 }, { "epoch": 0.05991276438153238, "grad_norm": 1.036713719367981, "learning_rate": 0.0001996153137085635, "loss": 1.4201, "step": 91 }, { "epoch": 0.06057114640770307, "grad_norm": 0.9899868965148926, "learning_rate": 0.00019959633246876987, "loss": 1.3296, "step": 92 }, { "epoch": 0.061229528433873755, "grad_norm": 1.1252297163009644, "learning_rate": 0.00019957689509668896, "loss": 1.3326, "step": 93 }, { "epoch": 0.06188791046004444, "grad_norm": 1.2600111961364746, "learning_rate": 0.0001995570016813404, "loss": 1.2094, "step": 94 }, { "epoch": 0.06254629248621513, "grad_norm": 1.5766092538833618, "learning_rate": 0.0001995366523138321, "loss": 1.5203, "step": 95 }, { "epoch": 0.06320467451238582, "grad_norm": 1.6614958047866821, "learning_rate": 0.00019951584708736037, "loss": 1.121, "step": 96 }, { "epoch": 0.06386305653855649, "grad_norm": 1.6387184858322144, "learning_rate": 0.00019949458609720922, "loss": 1.3053, "step": 97 }, { "epoch": 0.06452143856472718, "grad_norm": 1.6963297128677368, "learning_rate": 0.0001994728694407499, "loss": 1.1357, "step": 98 }, { "epoch": 0.06517982059089787, "grad_norm": 1.8991777896881104, "learning_rate": 0.0001994506972174406, "loss": 1.0274, "step": 99 }, { "epoch": 0.06583820261706855, "grad_norm": 2.750582218170166, "learning_rate": 0.00019942806952882584, "loss": 1.5686, "step": 100 }, { "epoch": 0.06649658464323924, "grad_norm": 0.3716993033885956, "learning_rate": 0.00019940498647853623, "loss": 1.2786, "step": 101 }, { "epoch": 0.06715496666940993, "grad_norm": 0.42230430245399475, "learning_rate": 0.0001993814481722877, "loss": 1.2765, "step": 102 }, { "epoch": 0.06781334869558062, "grad_norm": 0.41818639636039734, "learning_rate": 0.00019935745471788118, "loss": 1.2753, "step": 103 }, { "epoch": 0.06847173072175129, "grad_norm": 0.4580991566181183, "learning_rate": 0.00019933300622520224, "loss": 1.3207, "step": 104 }, { "epoch": 0.06913011274792198, "grad_norm": 0.46809226274490356, "learning_rate": 0.00019930810280622025, "loss": 1.1977, "step": 105 }, { "epoch": 0.06978849477409267, "grad_norm": 0.4010414183139801, "learning_rate": 0.00019928274457498818, "loss": 1.2758, "step": 106 }, { "epoch": 0.07044687680026335, "grad_norm": 0.39565059542655945, "learning_rate": 0.00019925693164764185, "loss": 1.2257, "step": 107 }, { "epoch": 0.07110525882643404, "grad_norm": 0.4842902421951294, "learning_rate": 0.00019923066414239963, "loss": 1.3928, "step": 108 }, { "epoch": 0.07176364085260473, "grad_norm": 0.4858724772930145, "learning_rate": 0.0001992039421795617, "loss": 1.5182, "step": 109 }, { "epoch": 0.07242202287877542, "grad_norm": 0.5046135187149048, "learning_rate": 0.00019917676588150957, "loss": 1.4308, "step": 110 }, { "epoch": 0.07308040490494609, "grad_norm": 0.49267032742500305, "learning_rate": 0.0001991491353727055, "loss": 1.3809, "step": 111 }, { "epoch": 0.07373878693111678, "grad_norm": 0.526466965675354, "learning_rate": 0.000199121050779692, "loss": 1.4057, "step": 112 }, { "epoch": 0.07439716895728747, "grad_norm": 0.5559642314910889, "learning_rate": 0.00019909251223109115, "loss": 1.3517, "step": 113 }, { "epoch": 0.07505555098345815, "grad_norm": 0.6022650599479675, "learning_rate": 0.00019906351985760407, "loss": 1.3474, "step": 114 }, { "epoch": 0.07571393300962884, "grad_norm": 0.6699438095092773, "learning_rate": 0.00019903407379201038, "loss": 1.3306, "step": 115 }, { "epoch": 0.07637231503579953, "grad_norm": 0.7189972400665283, "learning_rate": 0.00019900417416916742, "loss": 1.3242, "step": 116 }, { "epoch": 0.0770306970619702, "grad_norm": 0.8303741812705994, "learning_rate": 0.0001989738211260098, "loss": 1.3087, "step": 117 }, { "epoch": 0.07768907908814089, "grad_norm": 1.0564686059951782, "learning_rate": 0.00019894301480154872, "loss": 1.2889, "step": 118 }, { "epoch": 0.07834746111431158, "grad_norm": 1.0781389474868774, "learning_rate": 0.0001989117553368712, "loss": 1.3813, "step": 119 }, { "epoch": 0.07900584314048227, "grad_norm": 1.5083551406860352, "learning_rate": 0.0001988800428751398, "loss": 1.2692, "step": 120 }, { "epoch": 0.07966422516665295, "grad_norm": 1.4822843074798584, "learning_rate": 0.00019884787756159142, "loss": 1.2816, "step": 121 }, { "epoch": 0.08032260719282364, "grad_norm": 1.7007386684417725, "learning_rate": 0.0001988152595435372, "loss": 1.4115, "step": 122 }, { "epoch": 0.08098098921899433, "grad_norm": 1.5589900016784668, "learning_rate": 0.0001987821889703614, "loss": 1.2475, "step": 123 }, { "epoch": 0.081639371245165, "grad_norm": 1.7389734983444214, "learning_rate": 0.000198748665993521, "loss": 0.9493, "step": 124 }, { "epoch": 0.08229775327133569, "grad_norm": 3.232394218444824, "learning_rate": 0.00019871469076654486, "loss": 1.4505, "step": 125 }, { "epoch": 0.08295613529750638, "grad_norm": 0.2861728072166443, "learning_rate": 0.00019868026344503306, "loss": 1.1683, "step": 126 }, { "epoch": 0.08361451732367706, "grad_norm": 0.30585670471191406, "learning_rate": 0.00019864538418665618, "loss": 1.1787, "step": 127 }, { "epoch": 0.08427289934984775, "grad_norm": 0.31589433550834656, "learning_rate": 0.00019861005315115464, "loss": 1.1875, "step": 128 }, { "epoch": 0.08493128137601844, "grad_norm": 0.3322550356388092, "learning_rate": 0.00019857427050033787, "loss": 1.3271, "step": 129 }, { "epoch": 0.08558966340218913, "grad_norm": 0.35915452241897583, "learning_rate": 0.00019853803639808355, "loss": 1.297, "step": 130 }, { "epoch": 0.0862480454283598, "grad_norm": 0.365622878074646, "learning_rate": 0.00019850135101033705, "loss": 1.3612, "step": 131 }, { "epoch": 0.08690642745453049, "grad_norm": 0.36252641677856445, "learning_rate": 0.00019846421450511045, "loss": 1.2812, "step": 132 }, { "epoch": 0.08756480948070118, "grad_norm": 0.40368425846099854, "learning_rate": 0.0001984266270524819, "loss": 1.2269, "step": 133 }, { "epoch": 0.08822319150687186, "grad_norm": 0.464525431394577, "learning_rate": 0.00019838858882459482, "loss": 1.3796, "step": 134 }, { "epoch": 0.08888157353304255, "grad_norm": 0.44522568583488464, "learning_rate": 0.00019835009999565701, "loss": 1.3407, "step": 135 }, { "epoch": 0.08953995555921324, "grad_norm": 0.5510327816009521, "learning_rate": 0.00019831116074194006, "loss": 1.4003, "step": 136 }, { "epoch": 0.09019833758538393, "grad_norm": 0.5333742499351501, "learning_rate": 0.0001982717712417783, "loss": 1.4145, "step": 137 }, { "epoch": 0.0908567196115546, "grad_norm": 0.6081604957580566, "learning_rate": 0.0001982319316755682, "loss": 1.4204, "step": 138 }, { "epoch": 0.09151510163772529, "grad_norm": 0.6653099060058594, "learning_rate": 0.00019819164222576737, "loss": 1.2717, "step": 139 }, { "epoch": 0.09217348366389598, "grad_norm": 0.708417534828186, "learning_rate": 0.0001981509030768939, "loss": 1.3953, "step": 140 }, { "epoch": 0.09283186569006666, "grad_norm": 0.8157272338867188, "learning_rate": 0.0001981097144155253, "loss": 1.4022, "step": 141 }, { "epoch": 0.09349024771623735, "grad_norm": 0.9355374574661255, "learning_rate": 0.00019806807643029783, "loss": 1.1296, "step": 142 }, { "epoch": 0.09414862974240804, "grad_norm": 1.1245189905166626, "learning_rate": 0.00019802598931190557, "loss": 1.2992, "step": 143 }, { "epoch": 0.09480701176857871, "grad_norm": 1.2917778491973877, "learning_rate": 0.00019798345325309951, "loss": 1.3327, "step": 144 }, { "epoch": 0.0954653937947494, "grad_norm": 1.3809889554977417, "learning_rate": 0.00019794046844868673, "loss": 1.209, "step": 145 }, { "epoch": 0.09612377582092009, "grad_norm": 1.7017333507537842, "learning_rate": 0.00019789703509552945, "loss": 1.3897, "step": 146 }, { "epoch": 0.09678215784709078, "grad_norm": 1.4278146028518677, "learning_rate": 0.00019785315339254422, "loss": 1.0825, "step": 147 }, { "epoch": 0.09744053987326146, "grad_norm": 1.7400321960449219, "learning_rate": 0.00019780882354070086, "loss": 1.2225, "step": 148 }, { "epoch": 0.09809892189943215, "grad_norm": 2.0169289112091064, "learning_rate": 0.0001977640457430217, "loss": 1.0443, "step": 149 }, { "epoch": 0.09875730392560284, "grad_norm": 3.1941213607788086, "learning_rate": 0.00019771882020458054, "loss": 1.4298, "step": 150 }, { "epoch": 0.09941568595177351, "grad_norm": 0.28698843717575073, "learning_rate": 0.00019767314713250178, "loss": 1.1764, "step": 151 }, { "epoch": 0.1000740679779442, "grad_norm": 0.306293785572052, "learning_rate": 0.00019762702673595942, "loss": 1.2241, "step": 152 }, { "epoch": 0.10073245000411489, "grad_norm": 0.3089601993560791, "learning_rate": 0.00019758045922617604, "loss": 1.2665, "step": 153 }, { "epoch": 0.10139083203028558, "grad_norm": 0.3189047873020172, "learning_rate": 0.00019753344481642204, "loss": 1.1575, "step": 154 }, { "epoch": 0.10204921405645626, "grad_norm": 0.3168487250804901, "learning_rate": 0.00019748598372201453, "loss": 1.3434, "step": 155 }, { "epoch": 0.10270759608262695, "grad_norm": 0.34768885374069214, "learning_rate": 0.00019743807616031626, "loss": 1.3214, "step": 156 }, { "epoch": 0.10336597810879763, "grad_norm": 0.36940696835517883, "learning_rate": 0.00019738972235073475, "loss": 1.2846, "step": 157 }, { "epoch": 0.10402436013496831, "grad_norm": 0.36799123883247375, "learning_rate": 0.0001973409225147213, "loss": 1.2152, "step": 158 }, { "epoch": 0.104682742161139, "grad_norm": 0.4010167419910431, "learning_rate": 0.00019729167687576986, "loss": 1.2015, "step": 159 }, { "epoch": 0.10534112418730969, "grad_norm": 0.4580863416194916, "learning_rate": 0.00019724198565941614, "loss": 1.3175, "step": 160 }, { "epoch": 0.10599950621348037, "grad_norm": 0.4953341484069824, "learning_rate": 0.00019719184909323647, "loss": 1.3115, "step": 161 }, { "epoch": 0.10665788823965106, "grad_norm": 0.5160828828811646, "learning_rate": 0.00019714126740684676, "loss": 1.337, "step": 162 }, { "epoch": 0.10731627026582174, "grad_norm": 0.5345748066902161, "learning_rate": 0.00019709024083190157, "loss": 1.3652, "step": 163 }, { "epoch": 0.10797465229199243, "grad_norm": 0.6039602756500244, "learning_rate": 0.0001970387696020929, "loss": 1.2858, "step": 164 }, { "epoch": 0.10863303431816311, "grad_norm": 0.6503342390060425, "learning_rate": 0.00019698685395314923, "loss": 1.335, "step": 165 }, { "epoch": 0.1092914163443338, "grad_norm": 0.8055973052978516, "learning_rate": 0.00019693449412283435, "loss": 1.4875, "step": 166 }, { "epoch": 0.10994979837050449, "grad_norm": 0.8357879519462585, "learning_rate": 0.00019688169035094632, "loss": 1.1407, "step": 167 }, { "epoch": 0.11060818039667517, "grad_norm": 0.9279759526252747, "learning_rate": 0.00019682844287931644, "loss": 1.1185, "step": 168 }, { "epoch": 0.11126656242284585, "grad_norm": 1.166600227355957, "learning_rate": 0.00019677475195180796, "loss": 1.2068, "step": 169 }, { "epoch": 0.11192494444901654, "grad_norm": 1.4295028448104858, "learning_rate": 0.0001967206178143152, "loss": 1.313, "step": 170 }, { "epoch": 0.11258332647518723, "grad_norm": 1.330005168914795, "learning_rate": 0.00019666604071476215, "loss": 1.0859, "step": 171 }, { "epoch": 0.11324170850135791, "grad_norm": 1.461003303527832, "learning_rate": 0.00019661102090310156, "loss": 1.2116, "step": 172 }, { "epoch": 0.1139000905275286, "grad_norm": 1.7286120653152466, "learning_rate": 0.00019655555863131382, "loss": 1.0759, "step": 173 }, { "epoch": 0.11455847255369929, "grad_norm": 1.9271399974822998, "learning_rate": 0.00019649965415340553, "loss": 1.052, "step": 174 }, { "epoch": 0.11521685457986997, "grad_norm": 2.438812494277954, "learning_rate": 0.00019644330772540858, "loss": 1.0191, "step": 175 }, { "epoch": 0.11587523660604065, "grad_norm": 0.38223785161972046, "learning_rate": 0.0001963865196053789, "loss": 1.1215, "step": 176 }, { "epoch": 0.11653361863221134, "grad_norm": 0.42484092712402344, "learning_rate": 0.00019632929005339533, "loss": 1.2406, "step": 177 }, { "epoch": 0.11719200065838202, "grad_norm": 0.382169246673584, "learning_rate": 0.00019627161933155832, "loss": 1.2586, "step": 178 }, { "epoch": 0.11785038268455271, "grad_norm": 0.36252737045288086, "learning_rate": 0.00019621350770398886, "loss": 1.2092, "step": 179 }, { "epoch": 0.1185087647107234, "grad_norm": 0.38078591227531433, "learning_rate": 0.00019615495543682713, "loss": 1.3193, "step": 180 }, { "epoch": 0.11916714673689409, "grad_norm": 0.36796122789382935, "learning_rate": 0.00019609596279823136, "loss": 1.3066, "step": 181 }, { "epoch": 0.11982552876306476, "grad_norm": 0.3903866112232208, "learning_rate": 0.0001960365300583767, "loss": 1.299, "step": 182 }, { "epoch": 0.12048391078923545, "grad_norm": 0.40126746892929077, "learning_rate": 0.00019597665748945385, "loss": 1.2877, "step": 183 }, { "epoch": 0.12114229281540614, "grad_norm": 0.4428742527961731, "learning_rate": 0.00019591634536566765, "loss": 1.3277, "step": 184 }, { "epoch": 0.12180067484157682, "grad_norm": 0.49272388219833374, "learning_rate": 0.00019585559396323633, "loss": 1.4497, "step": 185 }, { "epoch": 0.12245905686774751, "grad_norm": 0.48855870962142944, "learning_rate": 0.00019579440356038967, "loss": 1.3676, "step": 186 }, { "epoch": 0.1231174388939182, "grad_norm": 0.5580255389213562, "learning_rate": 0.00019573277443736813, "loss": 1.3869, "step": 187 }, { "epoch": 0.12377582092008887, "grad_norm": 0.5840893983840942, "learning_rate": 0.0001956707068764214, "loss": 1.39, "step": 188 }, { "epoch": 0.12443420294625956, "grad_norm": 0.6014588475227356, "learning_rate": 0.00019560820116180713, "loss": 1.2715, "step": 189 }, { "epoch": 0.12509258497243025, "grad_norm": 0.7011117339134216, "learning_rate": 0.0001955452575797896, "loss": 1.3443, "step": 190 }, { "epoch": 0.12575096699860094, "grad_norm": 0.7889236211776733, "learning_rate": 0.00019548187641863844, "loss": 1.3345, "step": 191 }, { "epoch": 0.12640934902477163, "grad_norm": 1.1005409955978394, "learning_rate": 0.00019541805796862737, "loss": 1.6635, "step": 192 }, { "epoch": 0.12706773105094232, "grad_norm": 0.9664353728294373, "learning_rate": 0.00019535380252203274, "loss": 1.1395, "step": 193 }, { "epoch": 0.12772611307711298, "grad_norm": 1.0788633823394775, "learning_rate": 0.00019528911037313234, "loss": 1.1039, "step": 194 }, { "epoch": 0.12838449510328367, "grad_norm": 1.373862624168396, "learning_rate": 0.00019522398181820387, "loss": 1.3558, "step": 195 }, { "epoch": 0.12904287712945436, "grad_norm": 1.7253215312957764, "learning_rate": 0.00019515841715552376, "loss": 1.1314, "step": 196 }, { "epoch": 0.12970125915562505, "grad_norm": 1.3988890647888184, "learning_rate": 0.00019509241668536572, "loss": 1.2062, "step": 197 }, { "epoch": 0.13035964118179574, "grad_norm": 1.9207713603973389, "learning_rate": 0.0001950259807099994, "loss": 1.1584, "step": 198 }, { "epoch": 0.13101802320796643, "grad_norm": 2.127577543258667, "learning_rate": 0.0001949591095336889, "loss": 1.2511, "step": 199 }, { "epoch": 0.1316764052341371, "grad_norm": 2.8832123279571533, "learning_rate": 0.00019489180346269152, "loss": 1.286, "step": 200 }, { "epoch": 0.13233478726030778, "grad_norm": 0.3227054476737976, "learning_rate": 0.00019482406280525627, "loss": 1.2693, "step": 201 }, { "epoch": 0.13299316928647847, "grad_norm": 0.3575378656387329, "learning_rate": 0.0001947558878716225, "loss": 1.2478, "step": 202 }, { "epoch": 0.13365155131264916, "grad_norm": 0.34322816133499146, "learning_rate": 0.00019468727897401853, "loss": 1.2889, "step": 203 }, { "epoch": 0.13430993333881985, "grad_norm": 0.3204440772533417, "learning_rate": 0.00019461823642666, "loss": 1.2225, "step": 204 }, { "epoch": 0.13496831536499054, "grad_norm": 0.3347187042236328, "learning_rate": 0.00019454876054574865, "loss": 1.2421, "step": 205 }, { "epoch": 0.13562669739116123, "grad_norm": 0.34115180373191833, "learning_rate": 0.00019447885164947088, "loss": 1.2813, "step": 206 }, { "epoch": 0.1362850794173319, "grad_norm": 0.3722879886627197, "learning_rate": 0.0001944085100579961, "loss": 1.3301, "step": 207 }, { "epoch": 0.13694346144350258, "grad_norm": 0.40346071124076843, "learning_rate": 0.00019433773609347553, "loss": 1.2495, "step": 208 }, { "epoch": 0.13760184346967327, "grad_norm": 0.43641212582588196, "learning_rate": 0.0001942665300800404, "loss": 1.3419, "step": 209 }, { "epoch": 0.13826022549584396, "grad_norm": 0.4808942973613739, "learning_rate": 0.00019419489234380076, "loss": 1.4305, "step": 210 }, { "epoch": 0.13891860752201465, "grad_norm": 0.4939930737018585, "learning_rate": 0.0001941228232128438, "loss": 1.4084, "step": 211 }, { "epoch": 0.13957698954818534, "grad_norm": 0.5495529770851135, "learning_rate": 0.0001940503230172325, "loss": 1.4881, "step": 212 }, { "epoch": 0.14023537157435603, "grad_norm": 0.5891100168228149, "learning_rate": 0.000193977392089004, "loss": 1.3471, "step": 213 }, { "epoch": 0.1408937536005267, "grad_norm": 0.5917908549308777, "learning_rate": 0.00019390403076216805, "loss": 1.2568, "step": 214 }, { "epoch": 0.14155213562669738, "grad_norm": 0.7141900062561035, "learning_rate": 0.0001938302393727056, "loss": 1.3666, "step": 215 }, { "epoch": 0.14221051765286807, "grad_norm": 0.8463983535766602, "learning_rate": 0.00019375601825856724, "loss": 1.3119, "step": 216 }, { "epoch": 0.14286889967903876, "grad_norm": 0.9064818024635315, "learning_rate": 0.00019368136775967156, "loss": 1.2386, "step": 217 }, { "epoch": 0.14352728170520945, "grad_norm": 1.0032778978347778, "learning_rate": 0.00019360628821790362, "loss": 1.1037, "step": 218 }, { "epoch": 0.14418566373138014, "grad_norm": 1.0552408695220947, "learning_rate": 0.00019353077997711352, "loss": 1.1999, "step": 219 }, { "epoch": 0.14484404575755083, "grad_norm": 1.2882871627807617, "learning_rate": 0.00019345484338311467, "loss": 1.223, "step": 220 }, { "epoch": 0.1455024277837215, "grad_norm": 1.3708453178405762, "learning_rate": 0.00019337847878368218, "loss": 1.0157, "step": 221 }, { "epoch": 0.14616080980989218, "grad_norm": 1.4264692068099976, "learning_rate": 0.00019330168652855148, "loss": 1.1851, "step": 222 }, { "epoch": 0.14681919183606287, "grad_norm": 1.9550893306732178, "learning_rate": 0.00019322446696941646, "loss": 1.4243, "step": 223 }, { "epoch": 0.14747757386223356, "grad_norm": 2.0592246055603027, "learning_rate": 0.00019314682045992808, "loss": 1.4496, "step": 224 }, { "epoch": 0.14813595588840425, "grad_norm": 2.4582347869873047, "learning_rate": 0.00019306874735569257, "loss": 1.3581, "step": 225 }, { "epoch": 0.14879433791457494, "grad_norm": 0.30193549394607544, "learning_rate": 0.00019299024801426994, "loss": 1.2617, "step": 226 }, { "epoch": 0.1494527199407456, "grad_norm": 0.31942540407180786, "learning_rate": 0.0001929113227951723, "loss": 1.2463, "step": 227 }, { "epoch": 0.1501111019669163, "grad_norm": 0.3396655023097992, "learning_rate": 0.0001928319720598621, "loss": 1.1814, "step": 228 }, { "epoch": 0.15076948399308698, "grad_norm": 0.32212233543395996, "learning_rate": 0.00019275219617175066, "loss": 1.2016, "step": 229 }, { "epoch": 0.15142786601925767, "grad_norm": 0.3391817808151245, "learning_rate": 0.0001926719954961964, "loss": 1.2681, "step": 230 }, { "epoch": 0.15208624804542836, "grad_norm": 0.3941498100757599, "learning_rate": 0.00019259137040050322, "loss": 1.2394, "step": 231 }, { "epoch": 0.15274463007159905, "grad_norm": 0.36959075927734375, "learning_rate": 0.00019251032125391866, "loss": 1.3547, "step": 232 }, { "epoch": 0.15340301209776974, "grad_norm": 0.3768082559108734, "learning_rate": 0.00019242884842763248, "loss": 1.2967, "step": 233 }, { "epoch": 0.1540613941239404, "grad_norm": 0.4306476414203644, "learning_rate": 0.00019234695229477475, "loss": 1.3552, "step": 234 }, { "epoch": 0.1547197761501111, "grad_norm": 0.48177775740623474, "learning_rate": 0.0001922646332304142, "loss": 1.3916, "step": 235 }, { "epoch": 0.15537815817628178, "grad_norm": 0.5183802247047424, "learning_rate": 0.00019218189161155649, "loss": 1.4014, "step": 236 }, { "epoch": 0.15603654020245247, "grad_norm": 0.5142433643341064, "learning_rate": 0.00019209872781714252, "loss": 1.3847, "step": 237 }, { "epoch": 0.15669492222862316, "grad_norm": 0.5477801561355591, "learning_rate": 0.00019201514222804672, "loss": 1.4771, "step": 238 }, { "epoch": 0.15735330425479385, "grad_norm": 0.6392645239830017, "learning_rate": 0.00019193113522707515, "loss": 1.4259, "step": 239 }, { "epoch": 0.15801168628096454, "grad_norm": 0.7238642573356628, "learning_rate": 0.000191846707198964, "loss": 1.3741, "step": 240 }, { "epoch": 0.1586700683071352, "grad_norm": 0.8954312205314636, "learning_rate": 0.0001917618585303775, "loss": 1.3341, "step": 241 }, { "epoch": 0.1593284503333059, "grad_norm": 0.9385093450546265, "learning_rate": 0.0001916765896099065, "loss": 1.2135, "step": 242 }, { "epoch": 0.15998683235947658, "grad_norm": 1.1202630996704102, "learning_rate": 0.00019159090082806647, "loss": 1.2463, "step": 243 }, { "epoch": 0.16064521438564727, "grad_norm": 1.1828258037567139, "learning_rate": 0.00019150479257729573, "loss": 1.5219, "step": 244 }, { "epoch": 0.16130359641181796, "grad_norm": 1.3535192012786865, "learning_rate": 0.00019141826525195373, "loss": 1.4133, "step": 245 }, { "epoch": 0.16196197843798865, "grad_norm": 1.1620244979858398, "learning_rate": 0.00019133131924831917, "loss": 0.8327, "step": 246 }, { "epoch": 0.16262036046415934, "grad_norm": 1.342806339263916, "learning_rate": 0.00019124395496458827, "loss": 1.1851, "step": 247 }, { "epoch": 0.16327874249033, "grad_norm": 1.7129833698272705, "learning_rate": 0.0001911561728008728, "loss": 1.0681, "step": 248 }, { "epoch": 0.1639371245165007, "grad_norm": 1.9841933250427246, "learning_rate": 0.00019106797315919844, "loss": 1.4264, "step": 249 }, { "epoch": 0.16459550654267138, "grad_norm": 2.406813144683838, "learning_rate": 0.0001909793564435028, "loss": 1.0905, "step": 250 }, { "epoch": 0.16525388856884207, "grad_norm": 0.35957545042037964, "learning_rate": 0.00019089032305963358, "loss": 1.1371, "step": 251 }, { "epoch": 0.16591227059501276, "grad_norm": 0.3548797369003296, "learning_rate": 0.0001908008734153468, "loss": 1.2211, "step": 252 }, { "epoch": 0.16657065262118345, "grad_norm": 0.37374186515808105, "learning_rate": 0.00019071100792030486, "loss": 1.2192, "step": 253 }, { "epoch": 0.1672290346473541, "grad_norm": 0.35945966839790344, "learning_rate": 0.00019062072698607457, "loss": 1.2436, "step": 254 }, { "epoch": 0.1678874166735248, "grad_norm": 0.3470393419265747, "learning_rate": 0.00019053003102612555, "loss": 1.2912, "step": 255 }, { "epoch": 0.1685457986996955, "grad_norm": 0.38478219509124756, "learning_rate": 0.00019043892045582803, "loss": 1.2895, "step": 256 }, { "epoch": 0.16920418072586618, "grad_norm": 0.3745102882385254, "learning_rate": 0.00019034739569245113, "loss": 1.2061, "step": 257 }, { "epoch": 0.16986256275203687, "grad_norm": 0.4094540476799011, "learning_rate": 0.00019025545715516085, "loss": 1.2946, "step": 258 }, { "epoch": 0.17052094477820756, "grad_norm": 0.4305444359779358, "learning_rate": 0.0001901631052650183, "loss": 1.2985, "step": 259 }, { "epoch": 0.17117932680437825, "grad_norm": 0.4470147490501404, "learning_rate": 0.00019007034044497755, "loss": 1.2582, "step": 260 }, { "epoch": 0.1718377088305489, "grad_norm": 0.4666734039783478, "learning_rate": 0.0001899771631198839, "loss": 1.3592, "step": 261 }, { "epoch": 0.1724960908567196, "grad_norm": 0.5121479630470276, "learning_rate": 0.00018988357371647172, "loss": 1.4201, "step": 262 }, { "epoch": 0.1731544728828903, "grad_norm": 0.5547329783439636, "learning_rate": 0.00018978957266336277, "loss": 1.3465, "step": 263 }, { "epoch": 0.17381285490906098, "grad_norm": 0.5466268062591553, "learning_rate": 0.00018969516039106402, "loss": 1.3634, "step": 264 }, { "epoch": 0.17447123693523167, "grad_norm": 0.6425915360450745, "learning_rate": 0.0001896003373319657, "loss": 1.3324, "step": 265 }, { "epoch": 0.17512961896140236, "grad_norm": 0.7411608695983887, "learning_rate": 0.00018950510392033945, "loss": 1.2149, "step": 266 }, { "epoch": 0.17578800098757305, "grad_norm": 0.7877213358879089, "learning_rate": 0.00018940946059233622, "loss": 1.3433, "step": 267 }, { "epoch": 0.1764463830137437, "grad_norm": 0.9388232231140137, "learning_rate": 0.00018931340778598424, "loss": 1.2564, "step": 268 }, { "epoch": 0.1771047650399144, "grad_norm": 1.0754741430282593, "learning_rate": 0.00018921694594118717, "loss": 1.1404, "step": 269 }, { "epoch": 0.1777631470660851, "grad_norm": 1.3300153017044067, "learning_rate": 0.0001891200754997219, "loss": 1.055, "step": 270 }, { "epoch": 0.17842152909225578, "grad_norm": 1.3857688903808594, "learning_rate": 0.00018902279690523669, "loss": 0.9823, "step": 271 }, { "epoch": 0.17907991111842647, "grad_norm": 1.919142246246338, "learning_rate": 0.00018892511060324897, "loss": 1.4421, "step": 272 }, { "epoch": 0.17973829314459716, "grad_norm": 2.0909218788146973, "learning_rate": 0.0001888270170411435, "loss": 1.2757, "step": 273 }, { "epoch": 0.18039667517076785, "grad_norm": 1.6821421384811401, "learning_rate": 0.00018872851666817017, "loss": 1.07, "step": 274 }, { "epoch": 0.1810550571969385, "grad_norm": 3.6287171840667725, "learning_rate": 0.00018862960993544197, "loss": 1.3496, "step": 275 }, { "epoch": 0.1817134392231092, "grad_norm": 0.37574461102485657, "learning_rate": 0.00018853029729593295, "loss": 1.1978, "step": 276 }, { "epoch": 0.1823718212492799, "grad_norm": 0.3107951283454895, "learning_rate": 0.0001884305792044761, "loss": 1.2306, "step": 277 }, { "epoch": 0.18303020327545058, "grad_norm": 0.32990792393684387, "learning_rate": 0.0001883304561177614, "loss": 1.1505, "step": 278 }, { "epoch": 0.18368858530162127, "grad_norm": 0.3513798117637634, "learning_rate": 0.00018822992849433355, "loss": 1.2201, "step": 279 }, { "epoch": 0.18434696732779196, "grad_norm": 0.3479711711406708, "learning_rate": 0.00018812899679458993, "loss": 1.3056, "step": 280 }, { "epoch": 0.18500534935396265, "grad_norm": 0.33775678277015686, "learning_rate": 0.0001880276614807786, "loss": 1.1926, "step": 281 }, { "epoch": 0.1856637313801333, "grad_norm": 0.39962244033813477, "learning_rate": 0.0001879259230169961, "loss": 1.3247, "step": 282 }, { "epoch": 0.186322113406304, "grad_norm": 0.4246426820755005, "learning_rate": 0.00018782378186918514, "loss": 1.3709, "step": 283 }, { "epoch": 0.1869804954324747, "grad_norm": 0.4720629155635834, "learning_rate": 0.00018772123850513287, "loss": 1.2674, "step": 284 }, { "epoch": 0.18763887745864538, "grad_norm": 0.4945620894432068, "learning_rate": 0.00018761829339446843, "loss": 1.3351, "step": 285 }, { "epoch": 0.18829725948481607, "grad_norm": 0.49803289771080017, "learning_rate": 0.00018751494700866087, "loss": 1.2706, "step": 286 }, { "epoch": 0.18895564151098676, "grad_norm": 0.5121004581451416, "learning_rate": 0.00018741119982101695, "loss": 1.3867, "step": 287 }, { "epoch": 0.18961402353715742, "grad_norm": 0.5488736629486084, "learning_rate": 0.00018730705230667914, "loss": 1.4509, "step": 288 }, { "epoch": 0.1902724055633281, "grad_norm": 0.6060997843742371, "learning_rate": 0.00018720250494262328, "loss": 1.2907, "step": 289 }, { "epoch": 0.1909307875894988, "grad_norm": 0.6332529783248901, "learning_rate": 0.0001870975582076564, "loss": 1.3268, "step": 290 }, { "epoch": 0.1915891696156695, "grad_norm": 0.7396805882453918, "learning_rate": 0.00018699221258241467, "loss": 1.1929, "step": 291 }, { "epoch": 0.19224755164184018, "grad_norm": 0.7928360104560852, "learning_rate": 0.00018688646854936097, "loss": 1.3086, "step": 292 }, { "epoch": 0.19290593366801087, "grad_norm": 1.0803829431533813, "learning_rate": 0.00018678032659278295, "loss": 1.3736, "step": 293 }, { "epoch": 0.19356431569418156, "grad_norm": 1.1778184175491333, "learning_rate": 0.0001866737871987906, "loss": 1.1925, "step": 294 }, { "epoch": 0.19422269772035222, "grad_norm": 1.1690232753753662, "learning_rate": 0.00018656685085531415, "loss": 1.2878, "step": 295 }, { "epoch": 0.1948810797465229, "grad_norm": 1.2318227291107178, "learning_rate": 0.00018645951805210164, "loss": 0.8996, "step": 296 }, { "epoch": 0.1955394617726936, "grad_norm": 1.275986671447754, "learning_rate": 0.000186351789280717, "loss": 0.9593, "step": 297 }, { "epoch": 0.1961978437988643, "grad_norm": 1.4704487323760986, "learning_rate": 0.00018624366503453753, "loss": 1.0258, "step": 298 }, { "epoch": 0.19685622582503498, "grad_norm": 2.0199921131134033, "learning_rate": 0.00018613514580875174, "loss": 1.0027, "step": 299 }, { "epoch": 0.19751460785120567, "grad_norm": 3.0707104206085205, "learning_rate": 0.00018602623210035705, "loss": 1.435, "step": 300 }, { "epoch": 0.19817298987737636, "grad_norm": 0.33572354912757874, "learning_rate": 0.0001859169244081576, "loss": 1.2316, "step": 301 }, { "epoch": 0.19883137190354702, "grad_norm": 0.3585915267467499, "learning_rate": 0.00018580722323276186, "loss": 1.2425, "step": 302 }, { "epoch": 0.1994897539297177, "grad_norm": 0.3390888273715973, "learning_rate": 0.0001856971290765803, "loss": 1.2145, "step": 303 }, { "epoch": 0.2001481359558884, "grad_norm": 0.3473513424396515, "learning_rate": 0.00018558664244382338, "loss": 1.1988, "step": 304 }, { "epoch": 0.2008065179820591, "grad_norm": 0.33148831129074097, "learning_rate": 0.0001854757638404988, "loss": 1.2158, "step": 305 }, { "epoch": 0.20146490000822978, "grad_norm": 0.3568115532398224, "learning_rate": 0.0001853644937744095, "loss": 1.2566, "step": 306 }, { "epoch": 0.20212328203440047, "grad_norm": 0.3686254620552063, "learning_rate": 0.0001852528327551513, "loss": 1.2466, "step": 307 }, { "epoch": 0.20278166406057116, "grad_norm": 0.38901326060295105, "learning_rate": 0.00018514078129411044, "loss": 1.3647, "step": 308 }, { "epoch": 0.20344004608674182, "grad_norm": 0.41585972905158997, "learning_rate": 0.0001850283399044613, "loss": 1.2619, "step": 309 }, { "epoch": 0.2040984281129125, "grad_norm": 0.4686797261238098, "learning_rate": 0.00018491550910116415, "loss": 1.3821, "step": 310 }, { "epoch": 0.2047568101390832, "grad_norm": 0.5280426740646362, "learning_rate": 0.00018480228940096258, "loss": 1.3297, "step": 311 }, { "epoch": 0.2054151921652539, "grad_norm": 0.5117961168289185, "learning_rate": 0.00018468868132238137, "loss": 1.2887, "step": 312 }, { "epoch": 0.20607357419142458, "grad_norm": 0.6155792474746704, "learning_rate": 0.00018457468538572388, "loss": 1.2839, "step": 313 }, { "epoch": 0.20673195621759527, "grad_norm": 0.6095795631408691, "learning_rate": 0.00018446030211306992, "loss": 1.3214, "step": 314 }, { "epoch": 0.20739033824376593, "grad_norm": 0.67921382188797, "learning_rate": 0.00018434553202827312, "loss": 1.2865, "step": 315 }, { "epoch": 0.20804872026993662, "grad_norm": 0.7645331621170044, "learning_rate": 0.00018423037565695865, "loss": 1.5272, "step": 316 }, { "epoch": 0.2087071022961073, "grad_norm": 0.824537456035614, "learning_rate": 0.00018411483352652087, "loss": 1.0638, "step": 317 }, { "epoch": 0.209365484322278, "grad_norm": 0.9520829319953918, "learning_rate": 0.00018399890616612073, "loss": 1.2512, "step": 318 }, { "epoch": 0.2100238663484487, "grad_norm": 1.1447970867156982, "learning_rate": 0.00018388259410668357, "loss": 1.2487, "step": 319 }, { "epoch": 0.21068224837461938, "grad_norm": 1.2482191324234009, "learning_rate": 0.00018376589788089652, "loss": 1.2008, "step": 320 }, { "epoch": 0.21134063040079007, "grad_norm": 1.1981897354125977, "learning_rate": 0.00018364881802320613, "loss": 1.1732, "step": 321 }, { "epoch": 0.21199901242696073, "grad_norm": 1.2416824102401733, "learning_rate": 0.0001835313550698159, "loss": 1.1845, "step": 322 }, { "epoch": 0.21265739445313142, "grad_norm": 1.7832579612731934, "learning_rate": 0.00018341350955868388, "loss": 1.2821, "step": 323 }, { "epoch": 0.2133157764793021, "grad_norm": 1.8579916954040527, "learning_rate": 0.0001832952820295201, "loss": 1.0528, "step": 324 }, { "epoch": 0.2139741585054728, "grad_norm": 2.66621994972229, "learning_rate": 0.00018317667302378423, "loss": 1.1218, "step": 325 }, { "epoch": 0.2146325405316435, "grad_norm": 0.30232468247413635, "learning_rate": 0.00018305768308468293, "loss": 1.1782, "step": 326 }, { "epoch": 0.21529092255781418, "grad_norm": 0.31499120593070984, "learning_rate": 0.00018293831275716755, "loss": 1.156, "step": 327 }, { "epoch": 0.21594930458398487, "grad_norm": 0.339660108089447, "learning_rate": 0.00018281856258793152, "loss": 1.2193, "step": 328 }, { "epoch": 0.21660768661015553, "grad_norm": 0.31181803345680237, "learning_rate": 0.0001826984331254079, "loss": 1.2299, "step": 329 }, { "epoch": 0.21726606863632622, "grad_norm": 0.34332069754600525, "learning_rate": 0.00018257792491976676, "loss": 1.2874, "step": 330 }, { "epoch": 0.2179244506624969, "grad_norm": 0.33105602860450745, "learning_rate": 0.00018245703852291282, "loss": 1.2185, "step": 331 }, { "epoch": 0.2185828326886676, "grad_norm": 0.3488101065158844, "learning_rate": 0.00018233577448848284, "loss": 1.259, "step": 332 }, { "epoch": 0.2192412147148383, "grad_norm": 0.40130531787872314, "learning_rate": 0.00018221413337184302, "loss": 1.3751, "step": 333 }, { "epoch": 0.21989959674100898, "grad_norm": 0.4241950511932373, "learning_rate": 0.0001820921157300866, "loss": 1.2631, "step": 334 }, { "epoch": 0.22055797876717967, "grad_norm": 0.40443339943885803, "learning_rate": 0.00018196972212203126, "loss": 1.2457, "step": 335 }, { "epoch": 0.22121636079335033, "grad_norm": 0.444219708442688, "learning_rate": 0.00018184695310821637, "loss": 1.1849, "step": 336 }, { "epoch": 0.22187474281952102, "grad_norm": 0.49441009759902954, "learning_rate": 0.00018172380925090073, "loss": 1.3757, "step": 337 }, { "epoch": 0.2225331248456917, "grad_norm": 0.487322598695755, "learning_rate": 0.00018160029111405988, "loss": 1.2578, "step": 338 }, { "epoch": 0.2231915068718624, "grad_norm": 0.5444957613945007, "learning_rate": 0.0001814763992633834, "loss": 1.3719, "step": 339 }, { "epoch": 0.2238498888980331, "grad_norm": 0.6574757099151611, "learning_rate": 0.00018135213426627238, "loss": 1.3716, "step": 340 }, { "epoch": 0.22450827092420378, "grad_norm": 0.7351252436637878, "learning_rate": 0.000181227496691837, "loss": 1.2281, "step": 341 }, { "epoch": 0.22516665295037447, "grad_norm": 0.7623188495635986, "learning_rate": 0.0001811024871108936, "loss": 1.1949, "step": 342 }, { "epoch": 0.22582503497654513, "grad_norm": 0.9115209579467773, "learning_rate": 0.00018097710609596234, "loss": 1.3579, "step": 343 }, { "epoch": 0.22648341700271582, "grad_norm": 1.0563344955444336, "learning_rate": 0.00018085135422126446, "loss": 1.1628, "step": 344 }, { "epoch": 0.2271417990288865, "grad_norm": 1.0983742475509644, "learning_rate": 0.00018072523206271967, "loss": 1.1546, "step": 345 }, { "epoch": 0.2278001810550572, "grad_norm": 1.203003168106079, "learning_rate": 0.00018059874019794351, "loss": 1.099, "step": 346 }, { "epoch": 0.2284585630812279, "grad_norm": 1.3815973997116089, "learning_rate": 0.0001804718792062447, "loss": 1.1408, "step": 347 }, { "epoch": 0.22911694510739858, "grad_norm": 1.4059642553329468, "learning_rate": 0.00018034464966862245, "loss": 0.8299, "step": 348 }, { "epoch": 0.22977532713356924, "grad_norm": 1.6873667240142822, "learning_rate": 0.0001802170521677639, "loss": 0.98, "step": 349 }, { "epoch": 0.23043370915973993, "grad_norm": 2.4365310668945312, "learning_rate": 0.0001800890872880414, "loss": 1.0824, "step": 350 }, { "epoch": 0.23109209118591062, "grad_norm": 0.2800106108188629, "learning_rate": 0.00017996075561550976, "loss": 1.2194, "step": 351 }, { "epoch": 0.2317504732120813, "grad_norm": 0.31066974997520447, "learning_rate": 0.00017983205773790367, "loss": 1.1918, "step": 352 }, { "epoch": 0.232408855238252, "grad_norm": 0.3044240176677704, "learning_rate": 0.00017970299424463498, "loss": 1.2516, "step": 353 }, { "epoch": 0.2330672372644227, "grad_norm": 0.3345501124858856, "learning_rate": 0.00017957356572679, "loss": 1.2741, "step": 354 }, { "epoch": 0.23372561929059338, "grad_norm": 0.3216431736946106, "learning_rate": 0.00017944377277712668, "loss": 1.2471, "step": 355 }, { "epoch": 0.23438400131676404, "grad_norm": 0.33792850375175476, "learning_rate": 0.00017931361599007216, "loss": 1.2639, "step": 356 }, { "epoch": 0.23504238334293473, "grad_norm": 0.3501819670200348, "learning_rate": 0.0001791830959617198, "loss": 1.2292, "step": 357 }, { "epoch": 0.23570076536910542, "grad_norm": 0.3798786401748657, "learning_rate": 0.00017905221328982647, "loss": 1.2669, "step": 358 }, { "epoch": 0.2363591473952761, "grad_norm": 0.41154614090919495, "learning_rate": 0.00017892096857381002, "loss": 1.326, "step": 359 }, { "epoch": 0.2370175294214468, "grad_norm": 0.44466957449913025, "learning_rate": 0.0001787893624147463, "loss": 1.3761, "step": 360 }, { "epoch": 0.2376759114476175, "grad_norm": 0.5049508810043335, "learning_rate": 0.00017865739541536653, "loss": 1.3208, "step": 361 }, { "epoch": 0.23833429347378818, "grad_norm": 0.4856434166431427, "learning_rate": 0.00017852506818005447, "loss": 1.3196, "step": 362 }, { "epoch": 0.23899267549995884, "grad_norm": 0.5382542610168457, "learning_rate": 0.00017839238131484378, "loss": 1.37, "step": 363 }, { "epoch": 0.23965105752612953, "grad_norm": 0.5595437288284302, "learning_rate": 0.00017825933542741507, "loss": 1.2228, "step": 364 }, { "epoch": 0.24030943955230022, "grad_norm": 0.6511120796203613, "learning_rate": 0.00017812593112709324, "loss": 1.2641, "step": 365 }, { "epoch": 0.2409678215784709, "grad_norm": 0.7370897531509399, "learning_rate": 0.00017799216902484466, "loss": 1.2019, "step": 366 }, { "epoch": 0.2416262036046416, "grad_norm": 0.8269248604774475, "learning_rate": 0.00017785804973327433, "loss": 1.2945, "step": 367 }, { "epoch": 0.2422845856308123, "grad_norm": 1.1307923793792725, "learning_rate": 0.00017772357386662316, "loss": 1.4, "step": 368 }, { "epoch": 0.24294296765698298, "grad_norm": 1.285451889038086, "learning_rate": 0.00017758874204076505, "loss": 1.1158, "step": 369 }, { "epoch": 0.24360134968315364, "grad_norm": 1.386724591255188, "learning_rate": 0.00017745355487320418, "loss": 1.3709, "step": 370 }, { "epoch": 0.24425973170932433, "grad_norm": 1.353636622428894, "learning_rate": 0.00017731801298307212, "loss": 1.1778, "step": 371 }, { "epoch": 0.24491811373549502, "grad_norm": 1.605360984802246, "learning_rate": 0.00017718211699112494, "loss": 1.1777, "step": 372 }, { "epoch": 0.2455764957616657, "grad_norm": 1.677512288093567, "learning_rate": 0.00017704586751974048, "loss": 1.2474, "step": 373 }, { "epoch": 0.2462348777878364, "grad_norm": 1.7870368957519531, "learning_rate": 0.00017690926519291548, "loss": 0.9307, "step": 374 }, { "epoch": 0.2468932598140071, "grad_norm": 3.0082199573516846, "learning_rate": 0.00017677231063626265, "loss": 1.4513, "step": 375 }, { "epoch": 0.24755164184017775, "grad_norm": 0.25582268834114075, "learning_rate": 0.00017663500447700779, "loss": 1.1852, "step": 376 }, { "epoch": 0.24821002386634844, "grad_norm": 0.29881551861763, "learning_rate": 0.00017649734734398708, "loss": 1.2946, "step": 377 }, { "epoch": 0.24886840589251913, "grad_norm": 0.3092018663883209, "learning_rate": 0.00017635933986764402, "loss": 1.3313, "step": 378 }, { "epoch": 0.24952678791868982, "grad_norm": 0.3199716806411743, "learning_rate": 0.00017622098268002667, "loss": 1.2045, "step": 379 }, { "epoch": 0.2501851699448605, "grad_norm": 0.3237549662590027, "learning_rate": 0.00017608227641478466, "loss": 1.2029, "step": 380 }, { "epoch": 0.2501851699448605, "eval_loss": 1.2363184690475464, "eval_runtime": 142.0449, "eval_samples_per_second": 18.008, "eval_steps_per_second": 4.506, "step": 380 }, { "epoch": 0.2508435519710312, "grad_norm": 0.34310024976730347, "learning_rate": 0.00017594322170716634, "loss": 1.259, "step": 381 }, { "epoch": 0.2515019339972019, "grad_norm": 0.35043632984161377, "learning_rate": 0.00017580381919401586, "loss": 1.2779, "step": 382 }, { "epoch": 0.2521603160233726, "grad_norm": 0.38710206747055054, "learning_rate": 0.00017566406951377025, "loss": 1.3103, "step": 383 }, { "epoch": 0.25281869804954327, "grad_norm": 0.39176538586616516, "learning_rate": 0.0001755239733064565, "loss": 1.2745, "step": 384 }, { "epoch": 0.25347708007571396, "grad_norm": 0.46667057275772095, "learning_rate": 0.00017538353121368866, "loss": 1.2877, "step": 385 }, { "epoch": 0.25413546210188465, "grad_norm": 0.4763161242008209, "learning_rate": 0.00017524274387866484, "loss": 1.3045, "step": 386 }, { "epoch": 0.2547938441280553, "grad_norm": 0.5062443017959595, "learning_rate": 0.00017510161194616425, "loss": 1.248, "step": 387 }, { "epoch": 0.25545222615422597, "grad_norm": 0.5613611936569214, "learning_rate": 0.0001749601360625444, "loss": 1.2601, "step": 388 }, { "epoch": 0.25611060818039666, "grad_norm": 0.5726426243782043, "learning_rate": 0.0001748183168757379, "loss": 1.2773, "step": 389 }, { "epoch": 0.25676899020656735, "grad_norm": 0.6445503830909729, "learning_rate": 0.0001746761550352497, "loss": 1.2682, "step": 390 }, { "epoch": 0.25742737223273804, "grad_norm": 0.776296079158783, "learning_rate": 0.00017453365119215412, "loss": 1.3489, "step": 391 }, { "epoch": 0.25808575425890873, "grad_norm": 0.9601802229881287, "learning_rate": 0.00017439080599909162, "loss": 1.144, "step": 392 }, { "epoch": 0.2587441362850794, "grad_norm": 1.026221752166748, "learning_rate": 0.00017424762011026602, "loss": 1.501, "step": 393 }, { "epoch": 0.2594025183112501, "grad_norm": 1.1421236991882324, "learning_rate": 0.00017410409418144157, "loss": 1.3062, "step": 394 }, { "epoch": 0.2600609003374208, "grad_norm": 1.2660785913467407, "learning_rate": 0.00017396022886993973, "loss": 1.1689, "step": 395 }, { "epoch": 0.2607192823635915, "grad_norm": 1.3290542364120483, "learning_rate": 0.00017381602483463636, "loss": 1.0255, "step": 396 }, { "epoch": 0.2613776643897622, "grad_norm": 1.456329107284546, "learning_rate": 0.00017367148273595846, "loss": 1.129, "step": 397 }, { "epoch": 0.26203604641593287, "grad_norm": 1.6241428852081299, "learning_rate": 0.00017352660323588146, "loss": 1.323, "step": 398 }, { "epoch": 0.26269442844210356, "grad_norm": 1.8210939168930054, "learning_rate": 0.0001733813869979259, "loss": 0.8274, "step": 399 }, { "epoch": 0.2633528104682742, "grad_norm": 3.559173107147217, "learning_rate": 0.00017323583468715462, "loss": 1.7466, "step": 400 }, { "epoch": 0.2640111924944449, "grad_norm": 0.3388063609600067, "learning_rate": 0.00017308994697016954, "loss": 1.267, "step": 401 }, { "epoch": 0.26466957452061557, "grad_norm": 0.29530099034309387, "learning_rate": 0.0001729437245151087, "loss": 1.2249, "step": 402 }, { "epoch": 0.26532795654678626, "grad_norm": 0.30938857793807983, "learning_rate": 0.00017279716799164317, "loss": 1.2312, "step": 403 }, { "epoch": 0.26598633857295695, "grad_norm": 0.3707611560821533, "learning_rate": 0.000172650278070974, "loss": 1.2121, "step": 404 }, { "epoch": 0.26664472059912764, "grad_norm": 0.354686975479126, "learning_rate": 0.0001725030554258291, "loss": 1.1451, "step": 405 }, { "epoch": 0.26730310262529833, "grad_norm": 0.36545711755752563, "learning_rate": 0.00017235550073046028, "loss": 1.2994, "step": 406 }, { "epoch": 0.267961484651469, "grad_norm": 0.37162286043167114, "learning_rate": 0.00017220761466064, "loss": 1.4676, "step": 407 }, { "epoch": 0.2686198666776397, "grad_norm": 0.3644770681858063, "learning_rate": 0.00017205939789365833, "loss": 1.1517, "step": 408 }, { "epoch": 0.2692782487038104, "grad_norm": 0.4095366299152374, "learning_rate": 0.00017191085110831992, "loss": 1.359, "step": 409 }, { "epoch": 0.2699366307299811, "grad_norm": 0.44034093618392944, "learning_rate": 0.0001717619749849409, "loss": 1.3006, "step": 410 }, { "epoch": 0.2705950127561518, "grad_norm": 0.47280147671699524, "learning_rate": 0.0001716127702053456, "loss": 1.3272, "step": 411 }, { "epoch": 0.27125339478232247, "grad_norm": 0.49806755781173706, "learning_rate": 0.00017146323745286356, "loss": 1.2058, "step": 412 }, { "epoch": 0.27191177680849316, "grad_norm": 0.5420827865600586, "learning_rate": 0.00017131337741232644, "loss": 1.2231, "step": 413 }, { "epoch": 0.2725701588346638, "grad_norm": 0.6424184441566467, "learning_rate": 0.00017116319077006465, "loss": 1.3292, "step": 414 }, { "epoch": 0.2732285408608345, "grad_norm": 0.747867226600647, "learning_rate": 0.00017101267821390461, "loss": 1.3367, "step": 415 }, { "epoch": 0.27388692288700517, "grad_norm": 0.8218575716018677, "learning_rate": 0.0001708618404331652, "loss": 1.3283, "step": 416 }, { "epoch": 0.27454530491317586, "grad_norm": 0.8806688189506531, "learning_rate": 0.00017071067811865476, "loss": 1.1996, "step": 417 }, { "epoch": 0.27520368693934655, "grad_norm": 0.9571687579154968, "learning_rate": 0.00017055919196266804, "loss": 1.3096, "step": 418 }, { "epoch": 0.27586206896551724, "grad_norm": 1.098653793334961, "learning_rate": 0.00017040738265898287, "loss": 1.3812, "step": 419 }, { "epoch": 0.2765204509916879, "grad_norm": 1.294609546661377, "learning_rate": 0.000170255250902857, "loss": 1.3015, "step": 420 }, { "epoch": 0.2771788330178586, "grad_norm": 1.3969992399215698, "learning_rate": 0.00017010279739102497, "loss": 0.9505, "step": 421 }, { "epoch": 0.2778372150440293, "grad_norm": 1.490425705909729, "learning_rate": 0.00016995002282169493, "loss": 1.2844, "step": 422 }, { "epoch": 0.2784955970702, "grad_norm": 1.7336398363113403, "learning_rate": 0.0001697969278945454, "loss": 1.0083, "step": 423 }, { "epoch": 0.2791539790963707, "grad_norm": 1.5340614318847656, "learning_rate": 0.00016964351331072203, "loss": 0.8924, "step": 424 }, { "epoch": 0.2798123611225414, "grad_norm": 2.070929527282715, "learning_rate": 0.00016948977977283454, "loss": 1.0939, "step": 425 }, { "epoch": 0.28047074314871207, "grad_norm": 0.28991639614105225, "learning_rate": 0.00016933572798495328, "loss": 1.1573, "step": 426 }, { "epoch": 0.2811291251748827, "grad_norm": 0.31519120931625366, "learning_rate": 0.00016918135865260622, "loss": 1.215, "step": 427 }, { "epoch": 0.2817875072010534, "grad_norm": 0.31383201479911804, "learning_rate": 0.00016902667248277554, "loss": 1.2542, "step": 428 }, { "epoch": 0.2824458892272241, "grad_norm": 0.3328154385089874, "learning_rate": 0.0001688716701838946, "loss": 1.3019, "step": 429 }, { "epoch": 0.28310427125339477, "grad_norm": 0.3409973382949829, "learning_rate": 0.0001687163524658444, "loss": 1.1942, "step": 430 }, { "epoch": 0.28376265327956546, "grad_norm": 0.3398495316505432, "learning_rate": 0.0001685607200399506, "loss": 1.2197, "step": 431 }, { "epoch": 0.28442103530573615, "grad_norm": 0.345429927110672, "learning_rate": 0.00016840477361898022, "loss": 1.1713, "step": 432 }, { "epoch": 0.28507941733190684, "grad_norm": 0.3922118544578552, "learning_rate": 0.0001682485139171382, "loss": 1.2122, "step": 433 }, { "epoch": 0.2857377993580775, "grad_norm": 0.4266889989376068, "learning_rate": 0.00016809194165006433, "loss": 1.385, "step": 434 }, { "epoch": 0.2863961813842482, "grad_norm": 0.44210320711135864, "learning_rate": 0.0001679350575348298, "loss": 1.1992, "step": 435 }, { "epoch": 0.2870545634104189, "grad_norm": 0.45710140466690063, "learning_rate": 0.0001677778622899341, "loss": 1.3148, "step": 436 }, { "epoch": 0.2877129454365896, "grad_norm": 0.49983176589012146, "learning_rate": 0.0001676203566353016, "loss": 1.2936, "step": 437 }, { "epoch": 0.2883713274627603, "grad_norm": 0.5122978091239929, "learning_rate": 0.00016746254129227828, "loss": 1.3741, "step": 438 }, { "epoch": 0.289029709488931, "grad_norm": 0.5842157602310181, "learning_rate": 0.00016730441698362844, "loss": 1.3116, "step": 439 }, { "epoch": 0.28968809151510166, "grad_norm": 0.6748195886611938, "learning_rate": 0.00016714598443353138, "loss": 1.3625, "step": 440 }, { "epoch": 0.2903464735412723, "grad_norm": 0.682346761226654, "learning_rate": 0.0001669872443675781, "loss": 1.2786, "step": 441 }, { "epoch": 0.291004855567443, "grad_norm": 0.824905514717102, "learning_rate": 0.0001668281975127679, "loss": 1.2931, "step": 442 }, { "epoch": 0.2916632375936137, "grad_norm": 1.0475374460220337, "learning_rate": 0.00016666884459750527, "loss": 1.1593, "step": 443 }, { "epoch": 0.29232161961978437, "grad_norm": 0.9741135239601135, "learning_rate": 0.00016650918635159617, "loss": 1.284, "step": 444 }, { "epoch": 0.29298000164595506, "grad_norm": 1.258925199508667, "learning_rate": 0.00016634922350624518, "loss": 1.0439, "step": 445 }, { "epoch": 0.29363838367212575, "grad_norm": 1.1946061849594116, "learning_rate": 0.00016618895679405165, "loss": 1.0606, "step": 446 }, { "epoch": 0.29429676569829644, "grad_norm": 1.4616057872772217, "learning_rate": 0.00016602838694900672, "loss": 0.8763, "step": 447 }, { "epoch": 0.2949551477244671, "grad_norm": 1.498363733291626, "learning_rate": 0.00016586751470648977, "loss": 1.1925, "step": 448 }, { "epoch": 0.2956135297506378, "grad_norm": 2.09547758102417, "learning_rate": 0.00016570634080326517, "loss": 1.135, "step": 449 }, { "epoch": 0.2962719117768085, "grad_norm": 2.747232437133789, "learning_rate": 0.00016554486597747872, "loss": 1.4001, "step": 450 }, { "epoch": 0.2969302938029792, "grad_norm": 0.35306617617607117, "learning_rate": 0.00016538309096865444, "loss": 1.1834, "step": 451 }, { "epoch": 0.2975886758291499, "grad_norm": 0.3974539637565613, "learning_rate": 0.00016522101651769125, "loss": 1.2719, "step": 452 }, { "epoch": 0.2982470578553206, "grad_norm": 0.3502292037010193, "learning_rate": 0.0001650586433668593, "loss": 1.3184, "step": 453 }, { "epoch": 0.2989054398814912, "grad_norm": 0.4166954755783081, "learning_rate": 0.0001648959722597967, "loss": 1.1237, "step": 454 }, { "epoch": 0.2995638219076619, "grad_norm": 0.3814135193824768, "learning_rate": 0.00016473300394150635, "loss": 1.2911, "step": 455 }, { "epoch": 0.3002222039338326, "grad_norm": 0.36384913325309753, "learning_rate": 0.00016456973915835214, "loss": 1.1706, "step": 456 }, { "epoch": 0.3008805859600033, "grad_norm": 0.3671210706233978, "learning_rate": 0.00016440617865805574, "loss": 1.2955, "step": 457 }, { "epoch": 0.30153896798617397, "grad_norm": 0.4029741585254669, "learning_rate": 0.00016424232318969326, "loss": 1.4549, "step": 458 }, { "epoch": 0.30219735001234466, "grad_norm": 0.3963627219200134, "learning_rate": 0.00016407817350369154, "loss": 1.2036, "step": 459 }, { "epoch": 0.30285573203851535, "grad_norm": 0.44051656126976013, "learning_rate": 0.00016391373035182504, "loss": 1.459, "step": 460 }, { "epoch": 0.30351411406468604, "grad_norm": 0.47413092851638794, "learning_rate": 0.0001637489944872121, "loss": 1.2193, "step": 461 }, { "epoch": 0.3041724960908567, "grad_norm": 0.5480075478553772, "learning_rate": 0.00016358396666431174, "loss": 1.3583, "step": 462 }, { "epoch": 0.3048308781170274, "grad_norm": 0.5907076597213745, "learning_rate": 0.00016341864763891998, "loss": 1.2816, "step": 463 }, { "epoch": 0.3054892601431981, "grad_norm": 0.6378355026245117, "learning_rate": 0.00016325303816816668, "loss": 1.3787, "step": 464 }, { "epoch": 0.3061476421693688, "grad_norm": 0.7638369202613831, "learning_rate": 0.00016308713901051164, "loss": 1.2706, "step": 465 }, { "epoch": 0.3068060241955395, "grad_norm": 0.8754927515983582, "learning_rate": 0.00016292095092574154, "loss": 1.2107, "step": 466 }, { "epoch": 0.3074644062217102, "grad_norm": 1.072797179222107, "learning_rate": 0.0001627544746749663, "loss": 1.4412, "step": 467 }, { "epoch": 0.3081227882478808, "grad_norm": 1.0247933864593506, "learning_rate": 0.00016258771102061544, "loss": 1.277, "step": 468 }, { "epoch": 0.3087811702740515, "grad_norm": 1.1367268562316895, "learning_rate": 0.00016242066072643483, "loss": 1.086, "step": 469 }, { "epoch": 0.3094395523002222, "grad_norm": 1.2016199827194214, "learning_rate": 0.0001622533245574832, "loss": 1.0682, "step": 470 }, { "epoch": 0.3100979343263929, "grad_norm": 1.509184718132019, "learning_rate": 0.0001620857032801283, "loss": 1.1295, "step": 471 }, { "epoch": 0.31075631635256357, "grad_norm": 1.7264691591262817, "learning_rate": 0.0001619177976620438, "loss": 1.2027, "step": 472 }, { "epoch": 0.31141469837873426, "grad_norm": 1.6638644933700562, "learning_rate": 0.00016174960847220558, "loss": 1.0496, "step": 473 }, { "epoch": 0.31207308040490495, "grad_norm": 2.1179964542388916, "learning_rate": 0.00016158113648088808, "loss": 1.1623, "step": 474 }, { "epoch": 0.31273146243107564, "grad_norm": 2.5534677505493164, "learning_rate": 0.00016141238245966113, "loss": 1.1222, "step": 475 }, { "epoch": 0.3133898444572463, "grad_norm": 0.3433246314525604, "learning_rate": 0.00016124334718138603, "loss": 1.1649, "step": 476 }, { "epoch": 0.314048226483417, "grad_norm": 0.36493879556655884, "learning_rate": 0.00016107403142021228, "loss": 1.2114, "step": 477 }, { "epoch": 0.3147066085095877, "grad_norm": 0.3608406186103821, "learning_rate": 0.00016090443595157393, "loss": 1.1382, "step": 478 }, { "epoch": 0.3153649905357584, "grad_norm": 0.37234216928482056, "learning_rate": 0.000160734561552186, "loss": 1.2507, "step": 479 }, { "epoch": 0.3160233725619291, "grad_norm": 0.3545801639556885, "learning_rate": 0.00016056440900004092, "loss": 1.1621, "step": 480 }, { "epoch": 0.3166817545880997, "grad_norm": 0.361507385969162, "learning_rate": 0.00016039397907440512, "loss": 1.2853, "step": 481 }, { "epoch": 0.3173401366142704, "grad_norm": 0.37766724824905396, "learning_rate": 0.0001602232725558153, "loss": 1.3033, "step": 482 }, { "epoch": 0.3179985186404411, "grad_norm": 0.3873698115348816, "learning_rate": 0.00016005229022607485, "loss": 1.3018, "step": 483 }, { "epoch": 0.3186569006666118, "grad_norm": 0.41759756207466125, "learning_rate": 0.00015988103286825042, "loss": 1.2106, "step": 484 }, { "epoch": 0.3193152826927825, "grad_norm": 0.45689424872398376, "learning_rate": 0.00015970950126666816, "loss": 1.2782, "step": 485 }, { "epoch": 0.31997366471895317, "grad_norm": 0.482240229845047, "learning_rate": 0.00015953769620691022, "loss": 1.3583, "step": 486 }, { "epoch": 0.32063204674512386, "grad_norm": 0.5105469226837158, "learning_rate": 0.00015936561847581114, "loss": 1.2746, "step": 487 }, { "epoch": 0.32129042877129455, "grad_norm": 0.5805463194847107, "learning_rate": 0.00015919326886145422, "loss": 1.3807, "step": 488 }, { "epoch": 0.32194881079746523, "grad_norm": 0.5920444130897522, "learning_rate": 0.00015902064815316805, "loss": 1.2959, "step": 489 }, { "epoch": 0.3226071928236359, "grad_norm": 0.6884673237800598, "learning_rate": 0.0001588477571415226, "loss": 1.3562, "step": 490 }, { "epoch": 0.3232655748498066, "grad_norm": 0.8018052577972412, "learning_rate": 0.00015867459661832593, "loss": 1.193, "step": 491 }, { "epoch": 0.3239239568759773, "grad_norm": 0.8217924237251282, "learning_rate": 0.00015850116737662031, "loss": 1.3076, "step": 492 }, { "epoch": 0.324582338902148, "grad_norm": 0.9624382853507996, "learning_rate": 0.00015832747021067873, "loss": 1.1467, "step": 493 }, { "epoch": 0.3252407209283187, "grad_norm": 1.0854579210281372, "learning_rate": 0.00015815350591600125, "loss": 1.1437, "step": 494 }, { "epoch": 0.3258991029544893, "grad_norm": 1.3422967195510864, "learning_rate": 0.00015797927528931127, "loss": 1.3268, "step": 495 }, { "epoch": 0.32655748498066, "grad_norm": 1.4548892974853516, "learning_rate": 0.000157804779128552, "loss": 1.2196, "step": 496 }, { "epoch": 0.3272158670068307, "grad_norm": 1.4099973440170288, "learning_rate": 0.0001576300182328827, "loss": 1.1072, "step": 497 }, { "epoch": 0.3278742490330014, "grad_norm": 1.5790002346038818, "learning_rate": 0.00015745499340267508, "loss": 1.0054, "step": 498 }, { "epoch": 0.3285326310591721, "grad_norm": 1.8523013591766357, "learning_rate": 0.00015727970543950962, "loss": 0.9542, "step": 499 }, { "epoch": 0.32919101308534277, "grad_norm": 2.3925790786743164, "learning_rate": 0.00015710415514617188, "loss": 1.1818, "step": 500 }, { "epoch": 0.32984939511151345, "grad_norm": 0.27246442437171936, "learning_rate": 0.0001569283433266489, "loss": 1.0973, "step": 501 }, { "epoch": 0.33050777713768414, "grad_norm": 0.2933042645454407, "learning_rate": 0.0001567522707861254, "loss": 1.2255, "step": 502 }, { "epoch": 0.33116615916385483, "grad_norm": 0.3238910138607025, "learning_rate": 0.00015657593833098021, "loss": 1.1254, "step": 503 }, { "epoch": 0.3318245411900255, "grad_norm": 0.33603137731552124, "learning_rate": 0.0001563993467687824, "loss": 1.2374, "step": 504 }, { "epoch": 0.3324829232161962, "grad_norm": 0.3378768861293793, "learning_rate": 0.0001562224969082879, "loss": 1.2526, "step": 505 }, { "epoch": 0.3331413052423669, "grad_norm": 0.34633690118789673, "learning_rate": 0.0001560453895594354, "loss": 1.1546, "step": 506 }, { "epoch": 0.3337996872685376, "grad_norm": 0.3937011659145355, "learning_rate": 0.00015586802553334297, "loss": 1.2629, "step": 507 }, { "epoch": 0.3344580692947082, "grad_norm": 0.4210793375968933, "learning_rate": 0.00015569040564230414, "loss": 1.2708, "step": 508 }, { "epoch": 0.3351164513208789, "grad_norm": 0.453087717294693, "learning_rate": 0.00015551253069978426, "loss": 1.3762, "step": 509 }, { "epoch": 0.3357748333470496, "grad_norm": 0.4624793529510498, "learning_rate": 0.0001553344015204168, "loss": 1.3469, "step": 510 }, { "epoch": 0.3364332153732203, "grad_norm": 0.4992848038673401, "learning_rate": 0.0001551560189199996, "loss": 1.3616, "step": 511 }, { "epoch": 0.337091597399391, "grad_norm": 0.5379738807678223, "learning_rate": 0.0001549773837154911, "loss": 1.2494, "step": 512 }, { "epoch": 0.3377499794255617, "grad_norm": 0.5600094199180603, "learning_rate": 0.0001547984967250065, "loss": 1.2226, "step": 513 }, { "epoch": 0.33840836145173236, "grad_norm": 0.5964760780334473, "learning_rate": 0.00015461935876781436, "loss": 1.3194, "step": 514 }, { "epoch": 0.33906674347790305, "grad_norm": 0.695803165435791, "learning_rate": 0.00015443997066433247, "loss": 1.3516, "step": 515 }, { "epoch": 0.33972512550407374, "grad_norm": 0.7940467000007629, "learning_rate": 0.00015426033323612425, "loss": 1.2674, "step": 516 }, { "epoch": 0.34038350753024443, "grad_norm": 0.8288263082504272, "learning_rate": 0.000154080447305895, "loss": 1.1434, "step": 517 }, { "epoch": 0.3410418895564151, "grad_norm": 0.9712944030761719, "learning_rate": 0.00015390031369748818, "loss": 1.2007, "step": 518 }, { "epoch": 0.3417002715825858, "grad_norm": 1.311793327331543, "learning_rate": 0.00015371993323588142, "loss": 1.3704, "step": 519 }, { "epoch": 0.3423586536087565, "grad_norm": 1.2788738012313843, "learning_rate": 0.00015353930674718306, "loss": 1.0563, "step": 520 }, { "epoch": 0.3430170356349272, "grad_norm": 1.2688007354736328, "learning_rate": 0.00015335843505862802, "loss": 1.1603, "step": 521 }, { "epoch": 0.3436754176610978, "grad_norm": 1.6451671123504639, "learning_rate": 0.00015317731899857436, "loss": 1.2782, "step": 522 }, { "epoch": 0.3443337996872685, "grad_norm": 1.4891748428344727, "learning_rate": 0.00015299595939649918, "loss": 1.0099, "step": 523 }, { "epoch": 0.3449921817134392, "grad_norm": 2.0382041931152344, "learning_rate": 0.0001528143570829951, "loss": 1.2349, "step": 524 }, { "epoch": 0.3456505637396099, "grad_norm": 1.8571869134902954, "learning_rate": 0.0001526325128897661, "loss": 0.7098, "step": 525 }, { "epoch": 0.3463089457657806, "grad_norm": 0.26356932520866394, "learning_rate": 0.00015245042764962417, "loss": 1.1973, "step": 526 }, { "epoch": 0.3469673277919513, "grad_norm": 0.30310922861099243, "learning_rate": 0.00015226810219648505, "loss": 1.1846, "step": 527 }, { "epoch": 0.34762570981812196, "grad_norm": 0.3069896399974823, "learning_rate": 0.00015208553736536474, "loss": 1.2518, "step": 528 }, { "epoch": 0.34828409184429265, "grad_norm": 0.3580169677734375, "learning_rate": 0.0001519027339923754, "loss": 1.2043, "step": 529 }, { "epoch": 0.34894247387046334, "grad_norm": 0.33953437209129333, "learning_rate": 0.0001517196929147219, "loss": 1.2341, "step": 530 }, { "epoch": 0.34960085589663403, "grad_norm": 0.35636210441589355, "learning_rate": 0.0001515364149706975, "loss": 1.1965, "step": 531 }, { "epoch": 0.3502592379228047, "grad_norm": 0.3694870173931122, "learning_rate": 0.00015135290099968043, "loss": 1.2664, "step": 532 }, { "epoch": 0.3509176199489754, "grad_norm": 0.41831520199775696, "learning_rate": 0.0001511691518421298, "loss": 1.4323, "step": 533 }, { "epoch": 0.3515760019751461, "grad_norm": 0.45743826031684875, "learning_rate": 0.00015098516833958188, "loss": 1.3027, "step": 534 }, { "epoch": 0.3522343840013168, "grad_norm": 0.4819575548171997, "learning_rate": 0.0001508009513346461, "loss": 1.4308, "step": 535 }, { "epoch": 0.3528927660274874, "grad_norm": 0.47504669427871704, "learning_rate": 0.00015061650167100146, "loss": 1.1972, "step": 536 }, { "epoch": 0.3535511480536581, "grad_norm": 0.5313538312911987, "learning_rate": 0.0001504318201933923, "loss": 1.2737, "step": 537 }, { "epoch": 0.3542095300798288, "grad_norm": 0.5645316243171692, "learning_rate": 0.00015024690774762477, "loss": 1.2928, "step": 538 }, { "epoch": 0.3548679121059995, "grad_norm": 0.638116180896759, "learning_rate": 0.00015006176518056274, "loss": 1.2812, "step": 539 }, { "epoch": 0.3555262941321702, "grad_norm": 0.6634011268615723, "learning_rate": 0.00014987639334012397, "loss": 1.2685, "step": 540 }, { "epoch": 0.3561846761583409, "grad_norm": 0.7728289365768433, "learning_rate": 0.0001496907930752763, "loss": 1.2721, "step": 541 }, { "epoch": 0.35684305818451156, "grad_norm": 0.9165198802947998, "learning_rate": 0.00014950496523603372, "loss": 1.3018, "step": 542 }, { "epoch": 0.35750144021068225, "grad_norm": 1.189792275428772, "learning_rate": 0.00014931891067345246, "loss": 1.3033, "step": 543 }, { "epoch": 0.35815982223685294, "grad_norm": 1.1496312618255615, "learning_rate": 0.000149132630239627, "loss": 1.1322, "step": 544 }, { "epoch": 0.35881820426302363, "grad_norm": 1.1549041271209717, "learning_rate": 0.00014894612478768638, "loss": 0.9286, "step": 545 }, { "epoch": 0.3594765862891943, "grad_norm": 1.2562921047210693, "learning_rate": 0.00014875939517179016, "loss": 0.8819, "step": 546 }, { "epoch": 0.360134968315365, "grad_norm": 1.3253177404403687, "learning_rate": 0.00014857244224712455, "loss": 1.0631, "step": 547 }, { "epoch": 0.3607933503415357, "grad_norm": 1.6869429349899292, "learning_rate": 0.00014838526686989834, "loss": 1.1881, "step": 548 }, { "epoch": 0.36145173236770634, "grad_norm": 2.335425615310669, "learning_rate": 0.00014819786989733936, "loss": 1.2048, "step": 549 }, { "epoch": 0.362110114393877, "grad_norm": 2.4698996543884277, "learning_rate": 0.00014801025218769, "loss": 1.2909, "step": 550 }, { "epoch": 0.3627684964200477, "grad_norm": 0.27014511823654175, "learning_rate": 0.00014782241460020384, "loss": 1.2302, "step": 551 }, { "epoch": 0.3634268784462184, "grad_norm": 0.28544843196868896, "learning_rate": 0.00014763435799514132, "loss": 1.177, "step": 552 }, { "epoch": 0.3640852604723891, "grad_norm": 0.35948577523231506, "learning_rate": 0.000147446083233766, "loss": 1.2349, "step": 553 }, { "epoch": 0.3647436424985598, "grad_norm": 0.32412049174308777, "learning_rate": 0.00014725759117834044, "loss": 1.2907, "step": 554 }, { "epoch": 0.3654020245247305, "grad_norm": 0.33066362142562866, "learning_rate": 0.00014706888269212258, "loss": 1.2386, "step": 555 }, { "epoch": 0.36606040655090116, "grad_norm": 0.3326073884963989, "learning_rate": 0.00014687995863936135, "loss": 1.2678, "step": 556 }, { "epoch": 0.36671878857707185, "grad_norm": 0.36771395802497864, "learning_rate": 0.0001466908198852931, "loss": 1.3963, "step": 557 }, { "epoch": 0.36737717060324254, "grad_norm": 0.39641013741493225, "learning_rate": 0.00014650146729613733, "loss": 1.2419, "step": 558 }, { "epoch": 0.36803555262941323, "grad_norm": 0.4027023911476135, "learning_rate": 0.00014631190173909303, "loss": 1.345, "step": 559 }, { "epoch": 0.3686939346555839, "grad_norm": 0.4469105303287506, "learning_rate": 0.00014612212408233435, "loss": 1.4476, "step": 560 }, { "epoch": 0.3693523166817546, "grad_norm": 0.4740074574947357, "learning_rate": 0.000145932135195007, "loss": 1.3046, "step": 561 }, { "epoch": 0.3700106987079253, "grad_norm": 0.5272968411445618, "learning_rate": 0.00014574193594722395, "loss": 1.2359, "step": 562 }, { "epoch": 0.37066908073409593, "grad_norm": 0.5458040237426758, "learning_rate": 0.00014555152721006163, "loss": 1.332, "step": 563 }, { "epoch": 0.3713274627602666, "grad_norm": 0.5850545167922974, "learning_rate": 0.00014536090985555595, "loss": 1.2481, "step": 564 }, { "epoch": 0.3719858447864373, "grad_norm": 0.7109423875808716, "learning_rate": 0.0001451700847566981, "loss": 1.3476, "step": 565 }, { "epoch": 0.372644226812608, "grad_norm": 0.7439177632331848, "learning_rate": 0.00014497905278743083, "loss": 1.1857, "step": 566 }, { "epoch": 0.3733026088387787, "grad_norm": 0.9718471765518188, "learning_rate": 0.00014478781482264434, "loss": 1.3041, "step": 567 }, { "epoch": 0.3739609908649494, "grad_norm": 1.0369700193405151, "learning_rate": 0.00014459637173817213, "loss": 1.3576, "step": 568 }, { "epoch": 0.3746193728911201, "grad_norm": 1.2189229726791382, "learning_rate": 0.00014440472441078716, "loss": 1.1655, "step": 569 }, { "epoch": 0.37527775491729076, "grad_norm": 1.2079033851623535, "learning_rate": 0.00014421287371819782, "loss": 1.2692, "step": 570 }, { "epoch": 0.37593613694346145, "grad_norm": 1.1368714570999146, "learning_rate": 0.00014402082053904377, "loss": 0.9893, "step": 571 }, { "epoch": 0.37659451896963214, "grad_norm": 1.3622194528579712, "learning_rate": 0.00014382856575289224, "loss": 1.0648, "step": 572 }, { "epoch": 0.37725290099580283, "grad_norm": 1.4774643182754517, "learning_rate": 0.0001436361102402335, "loss": 0.9496, "step": 573 }, { "epoch": 0.3779112830219735, "grad_norm": 1.825216293334961, "learning_rate": 0.00014344345488247732, "loss": 0.7552, "step": 574 }, { "epoch": 0.3785696650481442, "grad_norm": 2.255516529083252, "learning_rate": 0.00014325060056194863, "loss": 0.9479, "step": 575 }, { "epoch": 0.37922804707431484, "grad_norm": 0.28654685616493225, "learning_rate": 0.00014305754816188357, "loss": 1.2666, "step": 576 }, { "epoch": 0.37988642910048553, "grad_norm": 0.2982519567012787, "learning_rate": 0.00014286429856642552, "loss": 1.2897, "step": 577 }, { "epoch": 0.3805448111266562, "grad_norm": 0.30523499846458435, "learning_rate": 0.0001426708526606209, "loss": 1.2694, "step": 578 }, { "epoch": 0.3812031931528269, "grad_norm": 0.3089349865913391, "learning_rate": 0.00014247721133041515, "loss": 1.2022, "step": 579 }, { "epoch": 0.3818615751789976, "grad_norm": 0.31746774911880493, "learning_rate": 0.0001422833754626489, "loss": 1.2587, "step": 580 }, { "epoch": 0.3825199572051683, "grad_norm": 0.3328789472579956, "learning_rate": 0.00014208934594505353, "loss": 1.2305, "step": 581 }, { "epoch": 0.383178339231339, "grad_norm": 0.3604884743690491, "learning_rate": 0.00014189512366624744, "loss": 1.2699, "step": 582 }, { "epoch": 0.38383672125750967, "grad_norm": 0.36411353945732117, "learning_rate": 0.0001417007095157317, "loss": 1.2266, "step": 583 }, { "epoch": 0.38449510328368036, "grad_norm": 0.4087217450141907, "learning_rate": 0.0001415061043838863, "loss": 1.2915, "step": 584 }, { "epoch": 0.38515348530985105, "grad_norm": 0.40117326378822327, "learning_rate": 0.0001413113091619657, "loss": 1.3334, "step": 585 }, { "epoch": 0.38581186733602174, "grad_norm": 0.45401015877723694, "learning_rate": 0.00014111632474209505, "loss": 1.2421, "step": 586 }, { "epoch": 0.38647024936219243, "grad_norm": 0.5087395310401917, "learning_rate": 0.00014092115201726598, "loss": 1.3305, "step": 587 }, { "epoch": 0.3871286313883631, "grad_norm": 0.5056115984916687, "learning_rate": 0.00014072579188133247, "loss": 1.2384, "step": 588 }, { "epoch": 0.3877870134145338, "grad_norm": 0.5636983513832092, "learning_rate": 0.00014053024522900684, "loss": 1.2972, "step": 589 }, { "epoch": 0.38844539544070444, "grad_norm": 0.5854564309120178, "learning_rate": 0.00014033451295585565, "loss": 1.2651, "step": 590 }, { "epoch": 0.38910377746687513, "grad_norm": 0.647421658039093, "learning_rate": 0.0001401385959582955, "loss": 1.3279, "step": 591 }, { "epoch": 0.3897621594930458, "grad_norm": 0.7400067448616028, "learning_rate": 0.00013994249513358905, "loss": 1.2224, "step": 592 }, { "epoch": 0.3904205415192165, "grad_norm": 0.8371015191078186, "learning_rate": 0.00013974621137984085, "loss": 1.2058, "step": 593 }, { "epoch": 0.3910789235453872, "grad_norm": 0.9347033500671387, "learning_rate": 0.0001395497455959932, "loss": 1.4082, "step": 594 }, { "epoch": 0.3917373055715579, "grad_norm": 1.1436898708343506, "learning_rate": 0.00013935309868182204, "loss": 1.1705, "step": 595 }, { "epoch": 0.3923956875977286, "grad_norm": 1.247821569442749, "learning_rate": 0.00013915627153793293, "loss": 1.0414, "step": 596 }, { "epoch": 0.39305406962389927, "grad_norm": 1.3201680183410645, "learning_rate": 0.00013895926506575683, "loss": 1.0165, "step": 597 }, { "epoch": 0.39371245165006996, "grad_norm": 1.7205967903137207, "learning_rate": 0.00013876208016754587, "loss": 1.1328, "step": 598 }, { "epoch": 0.39437083367624065, "grad_norm": 1.5844374895095825, "learning_rate": 0.0001385647177463696, "loss": 1.0601, "step": 599 }, { "epoch": 0.39502921570241134, "grad_norm": 1.9543534517288208, "learning_rate": 0.00013836717870611024, "loss": 1.0686, "step": 600 }, { "epoch": 0.39568759772858203, "grad_norm": 0.28580281138420105, "learning_rate": 0.0001381694639514592, "loss": 1.1646, "step": 601 }, { "epoch": 0.3963459797547527, "grad_norm": 0.3075709044933319, "learning_rate": 0.00013797157438791245, "loss": 1.2851, "step": 602 }, { "epoch": 0.39700436178092335, "grad_norm": 0.31908196210861206, "learning_rate": 0.00013777351092176664, "loss": 1.2926, "step": 603 }, { "epoch": 0.39766274380709404, "grad_norm": 0.3248385787010193, "learning_rate": 0.0001375752744601148, "loss": 1.2514, "step": 604 }, { "epoch": 0.39832112583326473, "grad_norm": 0.32760292291641235, "learning_rate": 0.0001373768659108423, "loss": 1.1686, "step": 605 }, { "epoch": 0.3989795078594354, "grad_norm": 0.35268697142601013, "learning_rate": 0.0001371782861826226, "loss": 1.2315, "step": 606 }, { "epoch": 0.3996378898856061, "grad_norm": 0.3568597435951233, "learning_rate": 0.00013697953618491314, "loss": 1.2494, "step": 607 }, { "epoch": 0.4002962719117768, "grad_norm": 0.3915829062461853, "learning_rate": 0.00013678061682795119, "loss": 1.279, "step": 608 }, { "epoch": 0.4009546539379475, "grad_norm": 0.4155282974243164, "learning_rate": 0.00013658152902274958, "loss": 1.329, "step": 609 }, { "epoch": 0.4016130359641182, "grad_norm": 0.43656131625175476, "learning_rate": 0.00013638227368109268, "loss": 1.4025, "step": 610 }, { "epoch": 0.40227141799028887, "grad_norm": 0.493294894695282, "learning_rate": 0.00013618285171553208, "loss": 1.3246, "step": 611 }, { "epoch": 0.40292980001645956, "grad_norm": 0.5095138549804688, "learning_rate": 0.00013598326403938256, "loss": 1.3534, "step": 612 }, { "epoch": 0.40358818204263025, "grad_norm": 0.5601783394813538, "learning_rate": 0.00013578351156671775, "loss": 1.2156, "step": 613 }, { "epoch": 0.40424656406880094, "grad_norm": 0.5981459617614746, "learning_rate": 0.000135583595212366, "loss": 1.3605, "step": 614 }, { "epoch": 0.40490494609497163, "grad_norm": 0.6983335614204407, "learning_rate": 0.00013538351589190625, "loss": 1.311, "step": 615 }, { "epoch": 0.4055633281211423, "grad_norm": 0.7210850119590759, "learning_rate": 0.00013518327452166385, "loss": 1.2873, "step": 616 }, { "epoch": 0.40622171014731295, "grad_norm": 0.8635502457618713, "learning_rate": 0.00013498287201870618, "loss": 1.1216, "step": 617 }, { "epoch": 0.40688009217348364, "grad_norm": 1.026448369026184, "learning_rate": 0.0001347823093008387, "loss": 1.3648, "step": 618 }, { "epoch": 0.40753847419965433, "grad_norm": 1.4044097661972046, "learning_rate": 0.00013458158728660047, "loss": 1.3143, "step": 619 }, { "epoch": 0.408196856225825, "grad_norm": 1.2657238245010376, "learning_rate": 0.00013438070689526032, "loss": 0.9066, "step": 620 }, { "epoch": 0.4088552382519957, "grad_norm": 1.2726161479949951, "learning_rate": 0.00013417966904681218, "loss": 1.1294, "step": 621 }, { "epoch": 0.4095136202781664, "grad_norm": 1.3671187162399292, "learning_rate": 0.00013397847466197133, "loss": 0.9815, "step": 622 }, { "epoch": 0.4101720023043371, "grad_norm": 1.6965669393539429, "learning_rate": 0.0001337771246621697, "loss": 0.9587, "step": 623 }, { "epoch": 0.4108303843305078, "grad_norm": 1.523437738418579, "learning_rate": 0.00013357561996955218, "loss": 0.8888, "step": 624 }, { "epoch": 0.41148876635667847, "grad_norm": 2.551095962524414, "learning_rate": 0.0001333739615069719, "loss": 1.4718, "step": 625 }, { "epoch": 0.41214714838284916, "grad_norm": 0.2869875729084015, "learning_rate": 0.00013317215019798638, "loss": 1.1899, "step": 626 }, { "epoch": 0.41280553040901985, "grad_norm": 0.3287839889526367, "learning_rate": 0.00013297018696685307, "loss": 1.2037, "step": 627 }, { "epoch": 0.41346391243519054, "grad_norm": 0.28788018226623535, "learning_rate": 0.0001327680727385252, "loss": 1.1815, "step": 628 }, { "epoch": 0.41412229446136123, "grad_norm": 0.3081077039241791, "learning_rate": 0.0001325658084386475, "loss": 1.2285, "step": 629 }, { "epoch": 0.41478067648753186, "grad_norm": 0.34519800543785095, "learning_rate": 0.00013236339499355217, "loss": 1.2773, "step": 630 }, { "epoch": 0.41543905851370255, "grad_norm": 0.4046002924442291, "learning_rate": 0.00013216083333025423, "loss": 1.1607, "step": 631 }, { "epoch": 0.41609744053987324, "grad_norm": 0.3767573833465576, "learning_rate": 0.0001319581243764477, "loss": 1.2453, "step": 632 }, { "epoch": 0.41675582256604393, "grad_norm": 0.4387420117855072, "learning_rate": 0.00013175526906050107, "loss": 1.33, "step": 633 }, { "epoch": 0.4174142045922146, "grad_norm": 0.3840062618255615, "learning_rate": 0.00013155226831145316, "loss": 1.0993, "step": 634 }, { "epoch": 0.4180725866183853, "grad_norm": 0.4738592803478241, "learning_rate": 0.00013134912305900882, "loss": 1.2066, "step": 635 }, { "epoch": 0.418730968644556, "grad_norm": 0.458029180765152, "learning_rate": 0.00013114583423353476, "loss": 1.3848, "step": 636 }, { "epoch": 0.4193893506707267, "grad_norm": 0.48436805605888367, "learning_rate": 0.00013094240276605515, "loss": 1.2229, "step": 637 }, { "epoch": 0.4200477326968974, "grad_norm": 0.5420469045639038, "learning_rate": 0.00013073882958824755, "loss": 1.3983, "step": 638 }, { "epoch": 0.42070611472306807, "grad_norm": 0.5722162127494812, "learning_rate": 0.00013053511563243837, "loss": 1.2612, "step": 639 }, { "epoch": 0.42136449674923876, "grad_norm": 0.6378898620605469, "learning_rate": 0.00013033126183159888, "loss": 1.3559, "step": 640 }, { "epoch": 0.42202287877540945, "grad_norm": 0.6519699692726135, "learning_rate": 0.00013012726911934075, "loss": 1.1572, "step": 641 }, { "epoch": 0.42268126080158014, "grad_norm": 0.793175220489502, "learning_rate": 0.00012992313842991187, "loss": 1.1228, "step": 642 }, { "epoch": 0.42333964282775083, "grad_norm": 1.0762929916381836, "learning_rate": 0.0001297188706981921, "loss": 1.3597, "step": 643 }, { "epoch": 0.42399802485392146, "grad_norm": 1.254856824874878, "learning_rate": 0.00012951446685968872, "loss": 1.24, "step": 644 }, { "epoch": 0.42465640688009215, "grad_norm": 1.163009524345398, "learning_rate": 0.00012930992785053258, "loss": 1.1964, "step": 645 }, { "epoch": 0.42531478890626284, "grad_norm": 1.3290621042251587, "learning_rate": 0.00012910525460747344, "loss": 1.1764, "step": 646 }, { "epoch": 0.42597317093243353, "grad_norm": 1.4044946432113647, "learning_rate": 0.00012890044806787598, "loss": 1.2975, "step": 647 }, { "epoch": 0.4266315529586042, "grad_norm": 1.8088845014572144, "learning_rate": 0.0001286955091697151, "loss": 1.5852, "step": 648 }, { "epoch": 0.4272899349847749, "grad_norm": 2.150536298751831, "learning_rate": 0.00012849043885157223, "loss": 0.9814, "step": 649 }, { "epoch": 0.4279483170109456, "grad_norm": 2.130786418914795, "learning_rate": 0.0001282852380526303, "loss": 1.0403, "step": 650 }, { "epoch": 0.4286066990371163, "grad_norm": 0.26838168501853943, "learning_rate": 0.00012807990771267008, "loss": 1.2076, "step": 651 }, { "epoch": 0.429265081063287, "grad_norm": 0.28282177448272705, "learning_rate": 0.0001278744487720655, "loss": 1.2995, "step": 652 }, { "epoch": 0.42992346308945767, "grad_norm": 0.30625301599502563, "learning_rate": 0.00012766886217177955, "loss": 1.2716, "step": 653 }, { "epoch": 0.43058184511562836, "grad_norm": 0.30854368209838867, "learning_rate": 0.0001274631488533597, "loss": 1.3002, "step": 654 }, { "epoch": 0.43124022714179905, "grad_norm": 0.34010156989097595, "learning_rate": 0.00012725730975893403, "loss": 1.2389, "step": 655 }, { "epoch": 0.43189860916796974, "grad_norm": 0.37646082043647766, "learning_rate": 0.00012705134583120638, "loss": 1.3756, "step": 656 }, { "epoch": 0.43255699119414037, "grad_norm": 0.3462966978549957, "learning_rate": 0.00012684525801345245, "loss": 1.2156, "step": 657 }, { "epoch": 0.43321537322031106, "grad_norm": 0.3970300257205963, "learning_rate": 0.00012663904724951529, "loss": 1.2378, "step": 658 }, { "epoch": 0.43387375524648175, "grad_norm": 0.41946786642074585, "learning_rate": 0.00012643271448380111, "loss": 1.3497, "step": 659 }, { "epoch": 0.43453213727265244, "grad_norm": 0.4470922350883484, "learning_rate": 0.0001262262606612747, "loss": 1.3033, "step": 660 }, { "epoch": 0.43519051929882313, "grad_norm": 0.5025902986526489, "learning_rate": 0.00012601968672745542, "loss": 1.3269, "step": 661 }, { "epoch": 0.4358489013249938, "grad_norm": 0.5296264886856079, "learning_rate": 0.00012581299362841262, "loss": 1.3694, "step": 662 }, { "epoch": 0.4365072833511645, "grad_norm": 0.5374211668968201, "learning_rate": 0.00012560618231076143, "loss": 1.2149, "step": 663 }, { "epoch": 0.4371656653773352, "grad_norm": 0.6261985898017883, "learning_rate": 0.00012539925372165845, "loss": 1.3619, "step": 664 }, { "epoch": 0.4378240474035059, "grad_norm": 0.6814325451850891, "learning_rate": 0.00012519220880879728, "loss": 1.1848, "step": 665 }, { "epoch": 0.4384824294296766, "grad_norm": 0.7937935590744019, "learning_rate": 0.00012498504852040434, "loss": 1.2499, "step": 666 }, { "epoch": 0.43914081145584727, "grad_norm": 0.8309380412101746, "learning_rate": 0.0001247777738052343, "loss": 1.1888, "step": 667 }, { "epoch": 0.43979919348201796, "grad_norm": 1.048019528388977, "learning_rate": 0.00012457038561256617, "loss": 1.2518, "step": 668 }, { "epoch": 0.44045757550818865, "grad_norm": 1.5039113759994507, "learning_rate": 0.00012436288489219832, "loss": 1.1941, "step": 669 }, { "epoch": 0.44111595753435934, "grad_norm": 1.160756230354309, "learning_rate": 0.00012415527259444472, "loss": 1.1634, "step": 670 }, { "epoch": 0.44177433956052997, "grad_norm": 1.244675636291504, "learning_rate": 0.0001239475496701302, "loss": 1.0483, "step": 671 }, { "epoch": 0.44243272158670066, "grad_norm": 1.5386816263198853, "learning_rate": 0.00012373971707058644, "loss": 1.1332, "step": 672 }, { "epoch": 0.44309110361287135, "grad_norm": 1.492594838142395, "learning_rate": 0.00012353177574764708, "loss": 1.0486, "step": 673 }, { "epoch": 0.44374948563904204, "grad_norm": 1.7617744207382202, "learning_rate": 0.00012332372665364407, "loss": 0.8815, "step": 674 }, { "epoch": 0.44440786766521273, "grad_norm": 2.5818214416503906, "learning_rate": 0.0001231155707414026, "loss": 1.2417, "step": 675 }, { "epoch": 0.4450662496913834, "grad_norm": 0.28840380907058716, "learning_rate": 0.0001229073089642373, "loss": 1.0849, "step": 676 }, { "epoch": 0.4457246317175541, "grad_norm": 0.3231571912765503, "learning_rate": 0.00012269894227594759, "loss": 1.2667, "step": 677 }, { "epoch": 0.4463830137437248, "grad_norm": 0.3091655969619751, "learning_rate": 0.00012249047163081324, "loss": 1.2213, "step": 678 }, { "epoch": 0.4470413957698955, "grad_norm": 0.3587014973163605, "learning_rate": 0.00012228189798359023, "loss": 1.2447, "step": 679 }, { "epoch": 0.4476997777960662, "grad_norm": 0.341579407453537, "learning_rate": 0.0001220732222895063, "loss": 1.2339, "step": 680 }, { "epoch": 0.44835815982223687, "grad_norm": 0.3748099207878113, "learning_rate": 0.00012186444550425643, "loss": 1.3603, "step": 681 }, { "epoch": 0.44901654184840756, "grad_norm": 0.3814949691295624, "learning_rate": 0.00012165556858399873, "loss": 1.2162, "step": 682 }, { "epoch": 0.44967492387457825, "grad_norm": 0.46470388770103455, "learning_rate": 0.00012144659248534972, "loss": 1.2451, "step": 683 }, { "epoch": 0.45033330590074894, "grad_norm": 0.3947675824165344, "learning_rate": 0.00012123751816538036, "loss": 1.2248, "step": 684 }, { "epoch": 0.45099168792691957, "grad_norm": 0.5034338235855103, "learning_rate": 0.00012102834658161122, "loss": 1.3963, "step": 685 }, { "epoch": 0.45165006995309026, "grad_norm": 0.46241188049316406, "learning_rate": 0.00012081907869200849, "loss": 1.3473, "step": 686 }, { "epoch": 0.45230845197926095, "grad_norm": 0.5000467300415039, "learning_rate": 0.00012060971545497937, "loss": 1.3853, "step": 687 }, { "epoch": 0.45296683400543164, "grad_norm": 0.5979068279266357, "learning_rate": 0.00012040025782936766, "loss": 1.1959, "step": 688 }, { "epoch": 0.45362521603160233, "grad_norm": 0.5908235907554626, "learning_rate": 0.00012019070677444955, "loss": 1.3634, "step": 689 }, { "epoch": 0.454283598057773, "grad_norm": 0.5893526077270508, "learning_rate": 0.00011998106324992906, "loss": 1.256, "step": 690 }, { "epoch": 0.4549419800839437, "grad_norm": 0.68131023645401, "learning_rate": 0.00011977132821593374, "loss": 1.3786, "step": 691 }, { "epoch": 0.4556003621101144, "grad_norm": 0.866544246673584, "learning_rate": 0.00011956150263301014, "loss": 1.2843, "step": 692 }, { "epoch": 0.4562587441362851, "grad_norm": 0.9520693421363831, "learning_rate": 0.0001193515874621197, "loss": 1.1041, "step": 693 }, { "epoch": 0.4569171261624558, "grad_norm": 1.0151017904281616, "learning_rate": 0.00011914158366463392, "loss": 1.0524, "step": 694 }, { "epoch": 0.45757550818862647, "grad_norm": 1.2233302593231201, "learning_rate": 0.0001189314922023304, "loss": 1.2917, "step": 695 }, { "epoch": 0.45823389021479716, "grad_norm": 1.3857312202453613, "learning_rate": 0.00011872131403738807, "loss": 1.0784, "step": 696 }, { "epoch": 0.45889227224096785, "grad_norm": 1.437481164932251, "learning_rate": 0.00011851105013238304, "loss": 1.2381, "step": 697 }, { "epoch": 0.4595506542671385, "grad_norm": 1.544447422027588, "learning_rate": 0.00011830070145028402, "loss": 1.162, "step": 698 }, { "epoch": 0.46020903629330917, "grad_norm": 1.5430525541305542, "learning_rate": 0.0001180902689544481, "loss": 0.9578, "step": 699 }, { "epoch": 0.46086741831947986, "grad_norm": 1.6765786409378052, "learning_rate": 0.00011787975360861608, "loss": 0.9626, "step": 700 }, { "epoch": 0.46152580034565055, "grad_norm": 0.28516799211502075, "learning_rate": 0.00011766915637690828, "loss": 1.1405, "step": 701 }, { "epoch": 0.46218418237182124, "grad_norm": 0.3133491277694702, "learning_rate": 0.00011745847822382004, "loss": 1.2342, "step": 702 }, { "epoch": 0.46284256439799193, "grad_norm": 0.3051530122756958, "learning_rate": 0.00011724772011421728, "loss": 1.1976, "step": 703 }, { "epoch": 0.4635009464241626, "grad_norm": 0.3280916213989258, "learning_rate": 0.00011703688301333211, "loss": 1.2157, "step": 704 }, { "epoch": 0.4641593284503333, "grad_norm": 0.348223477602005, "learning_rate": 0.00011682596788675839, "loss": 1.2648, "step": 705 }, { "epoch": 0.464817710476504, "grad_norm": 0.3523666262626648, "learning_rate": 0.00011661497570044738, "loss": 1.2263, "step": 706 }, { "epoch": 0.4654760925026747, "grad_norm": 0.3416306972503662, "learning_rate": 0.00011640390742070322, "loss": 1.1994, "step": 707 }, { "epoch": 0.4661344745288454, "grad_norm": 0.3957153856754303, "learning_rate": 0.00011619276401417849, "loss": 1.2677, "step": 708 }, { "epoch": 0.46679285655501607, "grad_norm": 0.4453893303871155, "learning_rate": 0.00011598154644786999, "loss": 1.3703, "step": 709 }, { "epoch": 0.46745123858118676, "grad_norm": 0.43442729115486145, "learning_rate": 0.00011577025568911395, "loss": 1.2058, "step": 710 }, { "epoch": 0.46810962060735745, "grad_norm": 0.5079056024551392, "learning_rate": 0.00011555889270558198, "loss": 1.2857, "step": 711 }, { "epoch": 0.4687680026335281, "grad_norm": 0.5519406199455261, "learning_rate": 0.00011534745846527641, "loss": 1.3221, "step": 712 }, { "epoch": 0.46942638465969877, "grad_norm": 0.55561763048172, "learning_rate": 0.00011513595393652586, "loss": 1.2663, "step": 713 }, { "epoch": 0.47008476668586946, "grad_norm": 0.698628842830658, "learning_rate": 0.00011492438008798092, "loss": 1.3198, "step": 714 }, { "epoch": 0.47074314871204015, "grad_norm": 0.6890379190444946, "learning_rate": 0.00011471273788860965, "loss": 1.2556, "step": 715 }, { "epoch": 0.47140153073821084, "grad_norm": 0.7377007007598877, "learning_rate": 0.00011450102830769314, "loss": 1.1379, "step": 716 }, { "epoch": 0.47205991276438153, "grad_norm": 0.8921574950218201, "learning_rate": 0.00011428925231482099, "loss": 1.2626, "step": 717 }, { "epoch": 0.4727182947905522, "grad_norm": 1.0351232290267944, "learning_rate": 0.00011407741087988712, "loss": 1.2653, "step": 718 }, { "epoch": 0.4733766768167229, "grad_norm": 1.1510781049728394, "learning_rate": 0.00011386550497308502, "loss": 1.1783, "step": 719 }, { "epoch": 0.4740350588428936, "grad_norm": 1.2471855878829956, "learning_rate": 0.00011365353556490348, "loss": 1.0714, "step": 720 }, { "epoch": 0.4746934408690643, "grad_norm": 1.3320882320404053, "learning_rate": 0.00011344150362612216, "loss": 1.0088, "step": 721 }, { "epoch": 0.475351822895235, "grad_norm": 1.3665803670883179, "learning_rate": 0.00011322941012780708, "loss": 0.835, "step": 722 }, { "epoch": 0.47601020492140567, "grad_norm": 1.5143336057662964, "learning_rate": 0.00011301725604130612, "loss": 0.9099, "step": 723 }, { "epoch": 0.47666858694757636, "grad_norm": 2.5366203784942627, "learning_rate": 0.00011280504233824481, "loss": 1.2175, "step": 724 }, { "epoch": 0.477326968973747, "grad_norm": 2.4705569744110107, "learning_rate": 0.0001125927699905215, "loss": 0.9952, "step": 725 }, { "epoch": 0.4779853509999177, "grad_norm": 0.34162166714668274, "learning_rate": 0.00011238043997030329, "loss": 1.1196, "step": 726 }, { "epoch": 0.47864373302608837, "grad_norm": 0.3529001772403717, "learning_rate": 0.00011216805325002134, "loss": 1.1777, "step": 727 }, { "epoch": 0.47930211505225906, "grad_norm": 0.35721468925476074, "learning_rate": 0.0001119556108023665, "loss": 1.1834, "step": 728 }, { "epoch": 0.47996049707842975, "grad_norm": 0.3928389847278595, "learning_rate": 0.00011174311360028478, "loss": 1.2291, "step": 729 }, { "epoch": 0.48061887910460044, "grad_norm": 0.3451043963432312, "learning_rate": 0.00011153056261697303, "loss": 1.3377, "step": 730 }, { "epoch": 0.48127726113077113, "grad_norm": 0.38403835892677307, "learning_rate": 0.00011131795882587442, "loss": 1.25, "step": 731 }, { "epoch": 0.4819356431569418, "grad_norm": 0.386591374874115, "learning_rate": 0.00011110530320067393, "loss": 1.3043, "step": 732 }, { "epoch": 0.4825940251831125, "grad_norm": 0.3729150593280792, "learning_rate": 0.00011089259671529388, "loss": 1.2401, "step": 733 }, { "epoch": 0.4832524072092832, "grad_norm": 0.4154976010322571, "learning_rate": 0.00011067984034388962, "loss": 1.3669, "step": 734 }, { "epoch": 0.4839107892354539, "grad_norm": 0.45570993423461914, "learning_rate": 0.00011046703506084495, "loss": 1.2276, "step": 735 }, { "epoch": 0.4845691712616246, "grad_norm": 0.4452035129070282, "learning_rate": 0.00011025418184076759, "loss": 1.2317, "step": 736 }, { "epoch": 0.48522755328779527, "grad_norm": 0.4917307198047638, "learning_rate": 0.0001100412816584849, "loss": 1.2464, "step": 737 }, { "epoch": 0.48588593531396596, "grad_norm": 0.5210979580879211, "learning_rate": 0.00010982833548903925, "loss": 1.3765, "step": 738 }, { "epoch": 0.4865443173401366, "grad_norm": 0.5611717104911804, "learning_rate": 0.0001096153443076837, "loss": 1.274, "step": 739 }, { "epoch": 0.4872026993663073, "grad_norm": 0.6100170612335205, "learning_rate": 0.00010940230908987736, "loss": 1.4141, "step": 740 }, { "epoch": 0.48786108139247797, "grad_norm": 0.6673282980918884, "learning_rate": 0.00010918923081128114, "loss": 1.3532, "step": 741 }, { "epoch": 0.48851946341864866, "grad_norm": 0.8000884652137756, "learning_rate": 0.00010897611044775298, "loss": 1.3492, "step": 742 }, { "epoch": 0.48917784544481935, "grad_norm": 0.8881794214248657, "learning_rate": 0.00010876294897534375, "loss": 1.2666, "step": 743 }, { "epoch": 0.48983622747099004, "grad_norm": 1.1199326515197754, "learning_rate": 0.00010854974737029249, "loss": 1.3141, "step": 744 }, { "epoch": 0.4904946094971607, "grad_norm": 1.334073543548584, "learning_rate": 0.00010833650660902204, "loss": 1.1719, "step": 745 }, { "epoch": 0.4911529915233314, "grad_norm": 1.2162492275238037, "learning_rate": 0.00010812322766813461, "loss": 0.8227, "step": 746 }, { "epoch": 0.4918113735495021, "grad_norm": 1.8882014751434326, "learning_rate": 0.00010790991152440726, "loss": 1.0215, "step": 747 }, { "epoch": 0.4924697555756728, "grad_norm": 1.4212284088134766, "learning_rate": 0.00010769655915478734, "loss": 1.0832, "step": 748 }, { "epoch": 0.4931281376018435, "grad_norm": 1.5909987688064575, "learning_rate": 0.0001074831715363883, "loss": 0.8689, "step": 749 }, { "epoch": 0.4937865196280142, "grad_norm": 2.5371243953704834, "learning_rate": 0.00010726974964648477, "loss": 0.9634, "step": 750 }, { "epoch": 0.49444490165418487, "grad_norm": 0.281331330537796, "learning_rate": 0.0001070562944625086, "loss": 1.223, "step": 751 }, { "epoch": 0.4951032836803555, "grad_norm": 0.2999142110347748, "learning_rate": 0.00010684280696204389, "loss": 1.2036, "step": 752 }, { "epoch": 0.4957616657065262, "grad_norm": 0.29755231738090515, "learning_rate": 0.00010662928812282293, "loss": 1.2743, "step": 753 }, { "epoch": 0.4964200477326969, "grad_norm": 0.3215511441230774, "learning_rate": 0.00010641573892272138, "loss": 1.207, "step": 754 }, { "epoch": 0.49707842975886757, "grad_norm": 0.3407268524169922, "learning_rate": 0.00010620216033975407, "loss": 1.2549, "step": 755 }, { "epoch": 0.49773681178503826, "grad_norm": 0.3472555875778198, "learning_rate": 0.00010598855335207032, "loss": 1.2366, "step": 756 }, { "epoch": 0.49839519381120895, "grad_norm": 0.36017248034477234, "learning_rate": 0.0001057749189379496, "loss": 1.2103, "step": 757 }, { "epoch": 0.49905357583737964, "grad_norm": 0.37401172518730164, "learning_rate": 0.00010556125807579691, "loss": 1.2066, "step": 758 }, { "epoch": 0.4997119578635503, "grad_norm": 0.40642622113227844, "learning_rate": 0.00010534757174413844, "loss": 1.1629, "step": 759 }, { "epoch": 0.500370339889721, "grad_norm": 0.4270761013031006, "learning_rate": 0.00010513386092161697, "loss": 1.2677, "step": 760 }, { "epoch": 0.500370339889721, "eval_loss": 1.2045371532440186, "eval_runtime": 142.955, "eval_samples_per_second": 17.894, "eval_steps_per_second": 4.477, "step": 760 }, { "epoch": 0.5010287219158917, "grad_norm": 0.4733821153640747, "learning_rate": 0.00010492012658698753, "loss": 1.3759, "step": 761 }, { "epoch": 0.5016871039420624, "grad_norm": 0.5020653009414673, "learning_rate": 0.00010470636971911276, "loss": 1.2907, "step": 762 }, { "epoch": 0.502345485968233, "grad_norm": 0.5274090766906738, "learning_rate": 0.00010449259129695851, "loss": 1.3062, "step": 763 }, { "epoch": 0.5030038679944038, "grad_norm": 0.5507718324661255, "learning_rate": 0.00010427879229958939, "loss": 1.2441, "step": 764 }, { "epoch": 0.5036622500205744, "grad_norm": 0.5971547365188599, "learning_rate": 0.00010406497370616414, "loss": 1.3003, "step": 765 }, { "epoch": 0.5043206320467452, "grad_norm": 0.7156444787979126, "learning_rate": 0.00010385113649593137, "loss": 1.266, "step": 766 }, { "epoch": 0.5049790140729158, "grad_norm": 0.7694118618965149, "learning_rate": 0.00010363728164822481, "loss": 1.3073, "step": 767 }, { "epoch": 0.5056373960990865, "grad_norm": 1.0051273107528687, "learning_rate": 0.00010342341014245918, "loss": 1.1993, "step": 768 }, { "epoch": 0.5062957781252572, "grad_norm": 0.9558060765266418, "learning_rate": 0.00010320952295812523, "loss": 1.087, "step": 769 }, { "epoch": 0.5069541601514279, "grad_norm": 1.224137783050537, "learning_rate": 0.00010299562107478569, "loss": 1.0047, "step": 770 }, { "epoch": 0.5076125421775985, "grad_norm": 1.2515257596969604, "learning_rate": 0.00010278170547207055, "loss": 0.9209, "step": 771 }, { "epoch": 0.5082709242037693, "grad_norm": 1.3540955781936646, "learning_rate": 0.00010256777712967269, "loss": 1.1832, "step": 772 }, { "epoch": 0.5089293062299399, "grad_norm": 1.5469887256622314, "learning_rate": 0.00010235383702734323, "loss": 1.1397, "step": 773 }, { "epoch": 0.5095876882561106, "grad_norm": 1.4079976081848145, "learning_rate": 0.00010213988614488721, "loss": 0.6813, "step": 774 }, { "epoch": 0.5102460702822813, "grad_norm": 1.7596197128295898, "learning_rate": 0.0001019259254621591, "loss": 0.9903, "step": 775 }, { "epoch": 0.5109044523084519, "grad_norm": 0.29065272212028503, "learning_rate": 0.00010171195595905812, "loss": 1.2383, "step": 776 }, { "epoch": 0.5115628343346227, "grad_norm": 0.3439897298812866, "learning_rate": 0.00010149797861552396, "loss": 1.1834, "step": 777 }, { "epoch": 0.5122212163607933, "grad_norm": 0.3276394307613373, "learning_rate": 0.00010128399441153228, "loss": 1.2489, "step": 778 }, { "epoch": 0.5128795983869641, "grad_norm": 0.34532174468040466, "learning_rate": 0.00010107000432709006, "loss": 1.1921, "step": 779 }, { "epoch": 0.5135379804131347, "grad_norm": 0.34875309467315674, "learning_rate": 0.00010085600934223121, "loss": 1.2439, "step": 780 }, { "epoch": 0.5141963624393054, "grad_norm": 0.3643695116043091, "learning_rate": 0.00010064201043701215, "loss": 1.2529, "step": 781 }, { "epoch": 0.5148547444654761, "grad_norm": 0.3640288710594177, "learning_rate": 0.00010042800859150725, "loss": 1.2328, "step": 782 }, { "epoch": 0.5155131264916468, "grad_norm": 0.37019646167755127, "learning_rate": 0.00010021400478580427, "loss": 1.1781, "step": 783 }, { "epoch": 0.5161715085178175, "grad_norm": 0.40062573552131653, "learning_rate": 0.0001, "loss": 1.1787, "step": 784 }, { "epoch": 0.5168298905439882, "grad_norm": 0.46246856451034546, "learning_rate": 9.978599521419574e-05, "loss": 1.3249, "step": 785 }, { "epoch": 0.5174882725701588, "grad_norm": 0.4728947579860687, "learning_rate": 9.957199140849278e-05, "loss": 1.4279, "step": 786 }, { "epoch": 0.5181466545963295, "grad_norm": 0.501487672328949, "learning_rate": 9.935798956298786e-05, "loss": 1.2406, "step": 787 }, { "epoch": 0.5188050366225002, "grad_norm": 0.5680145621299744, "learning_rate": 9.914399065776879e-05, "loss": 1.3382, "step": 788 }, { "epoch": 0.5194634186486708, "grad_norm": 0.6264930963516235, "learning_rate": 9.892999567290997e-05, "loss": 1.2645, "step": 789 }, { "epoch": 0.5201218006748416, "grad_norm": 0.6983153223991394, "learning_rate": 9.871600558846773e-05, "loss": 1.1803, "step": 790 }, { "epoch": 0.5207801827010122, "grad_norm": 0.7003601789474487, "learning_rate": 9.850202138447603e-05, "loss": 1.195, "step": 791 }, { "epoch": 0.521438564727183, "grad_norm": 0.936095118522644, "learning_rate": 9.828804404094192e-05, "loss": 1.2945, "step": 792 }, { "epoch": 0.5220969467533536, "grad_norm": 1.003400444984436, "learning_rate": 9.807407453784094e-05, "loss": 1.2078, "step": 793 }, { "epoch": 0.5227553287795244, "grad_norm": 1.1687790155410767, "learning_rate": 9.786011385511279e-05, "loss": 1.1849, "step": 794 }, { "epoch": 0.523413710805695, "grad_norm": 1.2150942087173462, "learning_rate": 9.764616297265682e-05, "loss": 1.195, "step": 795 }, { "epoch": 0.5240720928318657, "grad_norm": 1.3827168941497803, "learning_rate": 9.743222287032733e-05, "loss": 1.1726, "step": 796 }, { "epoch": 0.5247304748580364, "grad_norm": 1.3314828872680664, "learning_rate": 9.721829452792946e-05, "loss": 0.8454, "step": 797 }, { "epoch": 0.5253888568842071, "grad_norm": 1.2811524868011475, "learning_rate": 9.700437892521435e-05, "loss": 0.8986, "step": 798 }, { "epoch": 0.5260472389103777, "grad_norm": 1.8064638376235962, "learning_rate": 9.67904770418748e-05, "loss": 0.8644, "step": 799 }, { "epoch": 0.5267056209365484, "grad_norm": 1.8531551361083984, "learning_rate": 9.657658985754085e-05, "loss": 0.9905, "step": 800 }, { "epoch": 0.5273640029627191, "grad_norm": 0.2814549207687378, "learning_rate": 9.63627183517752e-05, "loss": 1.138, "step": 801 }, { "epoch": 0.5280223849888898, "grad_norm": 0.2797883152961731, "learning_rate": 9.614886350406864e-05, "loss": 1.2756, "step": 802 }, { "epoch": 0.5286807670150605, "grad_norm": 0.3077932298183441, "learning_rate": 9.593502629383586e-05, "loss": 1.1792, "step": 803 }, { "epoch": 0.5293391490412311, "grad_norm": 0.29853636026382446, "learning_rate": 9.572120770041064e-05, "loss": 1.1182, "step": 804 }, { "epoch": 0.5299975310674019, "grad_norm": 0.3280797600746155, "learning_rate": 9.55074087030415e-05, "loss": 1.271, "step": 805 }, { "epoch": 0.5306559130935725, "grad_norm": 0.3510564863681793, "learning_rate": 9.529363028088725e-05, "loss": 1.1824, "step": 806 }, { "epoch": 0.5313142951197433, "grad_norm": 0.4004097580909729, "learning_rate": 9.50798734130125e-05, "loss": 1.0833, "step": 807 }, { "epoch": 0.5319726771459139, "grad_norm": 0.4112261235713959, "learning_rate": 9.486613907838306e-05, "loss": 1.2171, "step": 808 }, { "epoch": 0.5326310591720846, "grad_norm": 0.4113888740539551, "learning_rate": 9.465242825586163e-05, "loss": 1.3967, "step": 809 }, { "epoch": 0.5332894411982553, "grad_norm": 0.4097476601600647, "learning_rate": 9.443874192420312e-05, "loss": 1.2218, "step": 810 }, { "epoch": 0.533947823224426, "grad_norm": 0.5748305320739746, "learning_rate": 9.422508106205041e-05, "loss": 1.3205, "step": 811 }, { "epoch": 0.5346062052505967, "grad_norm": 0.5156335830688477, "learning_rate": 9.401144664792969e-05, "loss": 1.3033, "step": 812 }, { "epoch": 0.5352645872767673, "grad_norm": 0.5478305816650391, "learning_rate": 9.379783966024595e-05, "loss": 1.3511, "step": 813 }, { "epoch": 0.535922969302938, "grad_norm": 0.5959839224815369, "learning_rate": 9.358426107727863e-05, "loss": 1.3697, "step": 814 }, { "epoch": 0.5365813513291087, "grad_norm": 0.6457132697105408, "learning_rate": 9.33707118771771e-05, "loss": 1.2885, "step": 815 }, { "epoch": 0.5372397333552794, "grad_norm": 0.713476300239563, "learning_rate": 9.315719303795612e-05, "loss": 1.1412, "step": 816 }, { "epoch": 0.53789811538145, "grad_norm": 0.8098880052566528, "learning_rate": 9.294370553749141e-05, "loss": 1.2196, "step": 817 }, { "epoch": 0.5385564974076208, "grad_norm": 0.9349793791770935, "learning_rate": 9.273025035351526e-05, "loss": 1.3399, "step": 818 }, { "epoch": 0.5392148794337914, "grad_norm": 1.0714279413223267, "learning_rate": 9.251682846361174e-05, "loss": 1.1459, "step": 819 }, { "epoch": 0.5398732614599622, "grad_norm": 1.1117820739746094, "learning_rate": 9.230344084521266e-05, "loss": 0.971, "step": 820 }, { "epoch": 0.5405316434861328, "grad_norm": 1.4912028312683105, "learning_rate": 9.209008847559279e-05, "loss": 1.0989, "step": 821 }, { "epoch": 0.5411900255123036, "grad_norm": 1.46620512008667, "learning_rate": 9.187677233186541e-05, "loss": 1.0167, "step": 822 }, { "epoch": 0.5418484075384742, "grad_norm": 1.8318724632263184, "learning_rate": 9.166349339097796e-05, "loss": 1.1506, "step": 823 }, { "epoch": 0.5425067895646449, "grad_norm": 1.571979284286499, "learning_rate": 9.145025262970757e-05, "loss": 0.9713, "step": 824 }, { "epoch": 0.5431651715908156, "grad_norm": 2.212709665298462, "learning_rate": 9.123705102465627e-05, "loss": 0.9631, "step": 825 }, { "epoch": 0.5438235536169863, "grad_norm": 0.26442617177963257, "learning_rate": 9.102388955224703e-05, "loss": 1.2412, "step": 826 }, { "epoch": 0.544481935643157, "grad_norm": 0.2992679476737976, "learning_rate": 9.081076918871891e-05, "loss": 1.2463, "step": 827 }, { "epoch": 0.5451403176693276, "grad_norm": 0.30470678210258484, "learning_rate": 9.059769091012265e-05, "loss": 1.2727, "step": 828 }, { "epoch": 0.5457986996954983, "grad_norm": 0.32433629035949707, "learning_rate": 9.038465569231633e-05, "loss": 1.285, "step": 829 }, { "epoch": 0.546457081721669, "grad_norm": 0.3425596356391907, "learning_rate": 9.017166451096077e-05, "loss": 1.2225, "step": 830 }, { "epoch": 0.5471154637478397, "grad_norm": 0.3451520800590515, "learning_rate": 8.995871834151512e-05, "loss": 1.3374, "step": 831 }, { "epoch": 0.5477738457740103, "grad_norm": 0.3612419068813324, "learning_rate": 8.974581815923242e-05, "loss": 1.3093, "step": 832 }, { "epoch": 0.5484322278001811, "grad_norm": 0.4222279489040375, "learning_rate": 8.953296493915508e-05, "loss": 1.3272, "step": 833 }, { "epoch": 0.5490906098263517, "grad_norm": 0.40727943181991577, "learning_rate": 8.932015965611039e-05, "loss": 1.2204, "step": 834 }, { "epoch": 0.5497489918525225, "grad_norm": 0.46858417987823486, "learning_rate": 8.910740328470613e-05, "loss": 1.3562, "step": 835 }, { "epoch": 0.5504073738786931, "grad_norm": 0.47174930572509766, "learning_rate": 8.889469679932612e-05, "loss": 1.2713, "step": 836 }, { "epoch": 0.5510657559048638, "grad_norm": 0.5161373019218445, "learning_rate": 8.86820411741256e-05, "loss": 1.3471, "step": 837 }, { "epoch": 0.5517241379310345, "grad_norm": 0.5460351705551147, "learning_rate": 8.846943738302697e-05, "loss": 1.2836, "step": 838 }, { "epoch": 0.5523825199572052, "grad_norm": 0.5956817865371704, "learning_rate": 8.825688639971527e-05, "loss": 1.2484, "step": 839 }, { "epoch": 0.5530409019833759, "grad_norm": 0.6519405841827393, "learning_rate": 8.804438919763353e-05, "loss": 1.1899, "step": 840 }, { "epoch": 0.5536992840095465, "grad_norm": 0.7240451574325562, "learning_rate": 8.783194674997866e-05, "loss": 1.2604, "step": 841 }, { "epoch": 0.5543576660357172, "grad_norm": 0.8447124361991882, "learning_rate": 8.761956002969672e-05, "loss": 1.2472, "step": 842 }, { "epoch": 0.5550160480618879, "grad_norm": 0.9665349125862122, "learning_rate": 8.74072300094785e-05, "loss": 1.1716, "step": 843 }, { "epoch": 0.5556744300880586, "grad_norm": 1.0502985715866089, "learning_rate": 8.719495766175519e-05, "loss": 1.2417, "step": 844 }, { "epoch": 0.5563328121142292, "grad_norm": 1.269997000694275, "learning_rate": 8.698274395869389e-05, "loss": 1.0818, "step": 845 }, { "epoch": 0.5569911941404, "grad_norm": 1.2584283351898193, "learning_rate": 8.677058987219295e-05, "loss": 0.8882, "step": 846 }, { "epoch": 0.5576495761665706, "grad_norm": 1.5052813291549683, "learning_rate": 8.655849637387785e-05, "loss": 0.914, "step": 847 }, { "epoch": 0.5583079581927414, "grad_norm": 1.5458886623382568, "learning_rate": 8.634646443509656e-05, "loss": 0.8089, "step": 848 }, { "epoch": 0.558966340218912, "grad_norm": 1.7558834552764893, "learning_rate": 8.613449502691502e-05, "loss": 1.2515, "step": 849 }, { "epoch": 0.5596247222450828, "grad_norm": 2.395742654800415, "learning_rate": 8.592258912011289e-05, "loss": 1.2407, "step": 850 }, { "epoch": 0.5602831042712534, "grad_norm": 0.2802131474018097, "learning_rate": 8.571074768517903e-05, "loss": 1.1524, "step": 851 }, { "epoch": 0.5609414862974241, "grad_norm": 0.2954949736595154, "learning_rate": 8.549897169230689e-05, "loss": 1.2065, "step": 852 }, { "epoch": 0.5615998683235948, "grad_norm": 0.2986692488193512, "learning_rate": 8.528726211139038e-05, "loss": 1.1301, "step": 853 }, { "epoch": 0.5622582503497654, "grad_norm": 0.33859890699386597, "learning_rate": 8.507561991201909e-05, "loss": 1.1587, "step": 854 }, { "epoch": 0.5629166323759361, "grad_norm": 0.3260893225669861, "learning_rate": 8.486404606347415e-05, "loss": 1.2068, "step": 855 }, { "epoch": 0.5635750144021068, "grad_norm": 0.3559131324291229, "learning_rate": 8.465254153472363e-05, "loss": 1.257, "step": 856 }, { "epoch": 0.5642333964282775, "grad_norm": 0.3739522695541382, "learning_rate": 8.444110729441805e-05, "loss": 1.2309, "step": 857 }, { "epoch": 0.5648917784544482, "grad_norm": 0.4222199022769928, "learning_rate": 8.422974431088607e-05, "loss": 1.2556, "step": 858 }, { "epoch": 0.5655501604806189, "grad_norm": 0.42629897594451904, "learning_rate": 8.401845355213006e-05, "loss": 1.2607, "step": 859 }, { "epoch": 0.5662085425067895, "grad_norm": 0.4653518497943878, "learning_rate": 8.380723598582152e-05, "loss": 1.3993, "step": 860 }, { "epoch": 0.5668669245329603, "grad_norm": 0.47311943769454956, "learning_rate": 8.359609257929679e-05, "loss": 1.2895, "step": 861 }, { "epoch": 0.5675253065591309, "grad_norm": 0.5377811193466187, "learning_rate": 8.338502429955263e-05, "loss": 1.2843, "step": 862 }, { "epoch": 0.5681836885853017, "grad_norm": 0.5767173171043396, "learning_rate": 8.317403211324163e-05, "loss": 1.2701, "step": 863 }, { "epoch": 0.5688420706114723, "grad_norm": 0.6143943071365356, "learning_rate": 8.296311698666792e-05, "loss": 1.2561, "step": 864 }, { "epoch": 0.569500452637643, "grad_norm": 0.72373366355896, "learning_rate": 8.275227988578276e-05, "loss": 1.1408, "step": 865 }, { "epoch": 0.5701588346638137, "grad_norm": 0.7811183929443359, "learning_rate": 8.254152177618e-05, "loss": 1.1143, "step": 866 }, { "epoch": 0.5708172166899843, "grad_norm": 0.8329409956932068, "learning_rate": 8.233084362309172e-05, "loss": 1.0897, "step": 867 }, { "epoch": 0.571475598716155, "grad_norm": 0.9763683676719666, "learning_rate": 8.212024639138397e-05, "loss": 1.2856, "step": 868 }, { "epoch": 0.5721339807423257, "grad_norm": 0.9951071739196777, "learning_rate": 8.190973104555193e-05, "loss": 1.0574, "step": 869 }, { "epoch": 0.5727923627684964, "grad_norm": 1.250862956047058, "learning_rate": 8.169929854971599e-05, "loss": 1.1666, "step": 870 }, { "epoch": 0.5734507447946671, "grad_norm": 1.4167237281799316, "learning_rate": 8.1488949867617e-05, "loss": 1.024, "step": 871 }, { "epoch": 0.5741091268208378, "grad_norm": 1.4582891464233398, "learning_rate": 8.127868596261197e-05, "loss": 0.9205, "step": 872 }, { "epoch": 0.5747675088470084, "grad_norm": 1.6985225677490234, "learning_rate": 8.106850779766961e-05, "loss": 0.8652, "step": 873 }, { "epoch": 0.5754258908731792, "grad_norm": 1.8304520845413208, "learning_rate": 8.08584163353661e-05, "loss": 1.0054, "step": 874 }, { "epoch": 0.5760842728993498, "grad_norm": 1.976290225982666, "learning_rate": 8.064841253788033e-05, "loss": 0.9366, "step": 875 }, { "epoch": 0.5767426549255206, "grad_norm": 0.25687241554260254, "learning_rate": 8.043849736698986e-05, "loss": 1.1735, "step": 876 }, { "epoch": 0.5774010369516912, "grad_norm": 0.2799146771430969, "learning_rate": 8.022867178406631e-05, "loss": 1.1901, "step": 877 }, { "epoch": 0.578059418977862, "grad_norm": 0.30865368247032166, "learning_rate": 8.001893675007098e-05, "loss": 1.2153, "step": 878 }, { "epoch": 0.5787178010040326, "grad_norm": 0.3172372281551361, "learning_rate": 7.980929322555048e-05, "loss": 1.2213, "step": 879 }, { "epoch": 0.5793761830302033, "grad_norm": 0.3272758424282074, "learning_rate": 7.95997421706324e-05, "loss": 1.2642, "step": 880 }, { "epoch": 0.580034565056374, "grad_norm": 0.3429342210292816, "learning_rate": 7.939028454502067e-05, "loss": 1.2277, "step": 881 }, { "epoch": 0.5806929470825446, "grad_norm": 0.3791300654411316, "learning_rate": 7.918092130799151e-05, "loss": 1.1537, "step": 882 }, { "epoch": 0.5813513291087153, "grad_norm": 0.3863638937473297, "learning_rate": 7.89716534183888e-05, "loss": 1.2679, "step": 883 }, { "epoch": 0.582009711134886, "grad_norm": 0.4009682238101959, "learning_rate": 7.876248183461968e-05, "loss": 1.3281, "step": 884 }, { "epoch": 0.5826680931610567, "grad_norm": 0.43973827362060547, "learning_rate": 7.855340751465028e-05, "loss": 1.3269, "step": 885 }, { "epoch": 0.5833264751872274, "grad_norm": 0.4711264669895172, "learning_rate": 7.83444314160013e-05, "loss": 1.162, "step": 886 }, { "epoch": 0.5839848572133981, "grad_norm": 0.4664533734321594, "learning_rate": 7.813555449574357e-05, "loss": 1.1792, "step": 887 }, { "epoch": 0.5846432392395687, "grad_norm": 0.5667819976806641, "learning_rate": 7.79267777104937e-05, "loss": 1.3348, "step": 888 }, { "epoch": 0.5853016212657395, "grad_norm": 0.6101278066635132, "learning_rate": 7.77181020164098e-05, "loss": 1.3424, "step": 889 }, { "epoch": 0.5859600032919101, "grad_norm": 0.7170456051826477, "learning_rate": 7.750952836918677e-05, "loss": 1.2527, "step": 890 }, { "epoch": 0.5866183853180809, "grad_norm": 0.7208893895149231, "learning_rate": 7.730105772405242e-05, "loss": 1.2508, "step": 891 }, { "epoch": 0.5872767673442515, "grad_norm": 0.889305591583252, "learning_rate": 7.709269103576268e-05, "loss": 1.2718, "step": 892 }, { "epoch": 0.5879351493704222, "grad_norm": 1.123354196548462, "learning_rate": 7.68844292585974e-05, "loss": 1.2583, "step": 893 }, { "epoch": 0.5885935313965929, "grad_norm": 1.109710693359375, "learning_rate": 7.667627334635595e-05, "loss": 1.1665, "step": 894 }, { "epoch": 0.5892519134227635, "grad_norm": 1.3003089427947998, "learning_rate": 7.646822425235293e-05, "loss": 1.2072, "step": 895 }, { "epoch": 0.5899102954489343, "grad_norm": 1.3461664915084839, "learning_rate": 7.626028292941361e-05, "loss": 0.9993, "step": 896 }, { "epoch": 0.5905686774751049, "grad_norm": 1.7733616828918457, "learning_rate": 7.605245032986979e-05, "loss": 0.8983, "step": 897 }, { "epoch": 0.5912270595012756, "grad_norm": 1.554667592048645, "learning_rate": 7.584472740555533e-05, "loss": 1.0332, "step": 898 }, { "epoch": 0.5918854415274463, "grad_norm": 1.471677303314209, "learning_rate": 7.563711510780172e-05, "loss": 0.8024, "step": 899 }, { "epoch": 0.592543823553617, "grad_norm": 2.209123373031616, "learning_rate": 7.542961438743389e-05, "loss": 1.2601, "step": 900 }, { "epoch": 0.5932022055797876, "grad_norm": 0.2778402268886566, "learning_rate": 7.52222261947657e-05, "loss": 1.233, "step": 901 }, { "epoch": 0.5938605876059584, "grad_norm": 0.28793758153915405, "learning_rate": 7.501495147959568e-05, "loss": 1.2259, "step": 902 }, { "epoch": 0.594518969632129, "grad_norm": 0.3153240382671356, "learning_rate": 7.480779119120275e-05, "loss": 1.2447, "step": 903 }, { "epoch": 0.5951773516582998, "grad_norm": 0.3157704174518585, "learning_rate": 7.460074627834157e-05, "loss": 1.167, "step": 904 }, { "epoch": 0.5958357336844704, "grad_norm": 0.3390519917011261, "learning_rate": 7.439381768923856e-05, "loss": 1.255, "step": 905 }, { "epoch": 0.5964941157106411, "grad_norm": 0.4002874791622162, "learning_rate": 7.418700637158742e-05, "loss": 1.287, "step": 906 }, { "epoch": 0.5971524977368118, "grad_norm": 0.36236798763275146, "learning_rate": 7.39803132725446e-05, "loss": 1.247, "step": 907 }, { "epoch": 0.5978108797629824, "grad_norm": 0.4082358479499817, "learning_rate": 7.37737393387253e-05, "loss": 1.3088, "step": 908 }, { "epoch": 0.5984692617891532, "grad_norm": 0.4432787597179413, "learning_rate": 7.356728551619894e-05, "loss": 1.3925, "step": 909 }, { "epoch": 0.5991276438153238, "grad_norm": 0.45395582914352417, "learning_rate": 7.336095275048474e-05, "loss": 1.3254, "step": 910 }, { "epoch": 0.5997860258414945, "grad_norm": 0.4909254014492035, "learning_rate": 7.315474198654757e-05, "loss": 1.3132, "step": 911 }, { "epoch": 0.6004444078676652, "grad_norm": 0.5084662437438965, "learning_rate": 7.294865416879366e-05, "loss": 1.3051, "step": 912 }, { "epoch": 0.6011027898938359, "grad_norm": 0.5864781141281128, "learning_rate": 7.274269024106601e-05, "loss": 1.3598, "step": 913 }, { "epoch": 0.6017611719200066, "grad_norm": 0.6329978108406067, "learning_rate": 7.253685114664028e-05, "loss": 1.3116, "step": 914 }, { "epoch": 0.6024195539461773, "grad_norm": 0.725957989692688, "learning_rate": 7.23311378282205e-05, "loss": 1.4055, "step": 915 }, { "epoch": 0.6030779359723479, "grad_norm": 0.8368108868598938, "learning_rate": 7.212555122793452e-05, "loss": 1.306, "step": 916 }, { "epoch": 0.6037363179985187, "grad_norm": 0.9409517049789429, "learning_rate": 7.192009228732992e-05, "loss": 1.3006, "step": 917 }, { "epoch": 0.6043947000246893, "grad_norm": 1.0104981660842896, "learning_rate": 7.171476194736975e-05, "loss": 1.1905, "step": 918 }, { "epoch": 0.6050530820508601, "grad_norm": 1.1959478855133057, "learning_rate": 7.150956114842781e-05, "loss": 1.337, "step": 919 }, { "epoch": 0.6057114640770307, "grad_norm": 1.1645923852920532, "learning_rate": 7.130449083028487e-05, "loss": 1.0021, "step": 920 }, { "epoch": 0.6063698461032014, "grad_norm": 1.4529732465744019, "learning_rate": 7.109955193212406e-05, "loss": 1.2319, "step": 921 }, { "epoch": 0.6070282281293721, "grad_norm": 1.368401050567627, "learning_rate": 7.089474539252657e-05, "loss": 0.9934, "step": 922 }, { "epoch": 0.6076866101555427, "grad_norm": 1.534131646156311, "learning_rate": 7.069007214946743e-05, "loss": 1.0236, "step": 923 }, { "epoch": 0.6083449921817135, "grad_norm": 1.9575016498565674, "learning_rate": 7.048553314031132e-05, "loss": 1.1437, "step": 924 }, { "epoch": 0.6090033742078841, "grad_norm": 2.49653697013855, "learning_rate": 7.028112930180794e-05, "loss": 1.3047, "step": 925 }, { "epoch": 0.6096617562340548, "grad_norm": 0.2791464030742645, "learning_rate": 7.00768615700881e-05, "loss": 1.1377, "step": 926 }, { "epoch": 0.6103201382602255, "grad_norm": 0.28302690386772156, "learning_rate": 6.987273088065927e-05, "loss": 1.243, "step": 927 }, { "epoch": 0.6109785202863962, "grad_norm": 0.31983238458633423, "learning_rate": 6.966873816840114e-05, "loss": 1.2687, "step": 928 }, { "epoch": 0.6116369023125668, "grad_norm": 0.34805828332901, "learning_rate": 6.946488436756164e-05, "loss": 1.3078, "step": 929 }, { "epoch": 0.6122952843387376, "grad_norm": 0.33764809370040894, "learning_rate": 6.92611704117525e-05, "loss": 1.2175, "step": 930 }, { "epoch": 0.6129536663649082, "grad_norm": 0.369502991437912, "learning_rate": 6.905759723394488e-05, "loss": 1.1722, "step": 931 }, { "epoch": 0.613612048391079, "grad_norm": 0.38398030400276184, "learning_rate": 6.885416576646525e-05, "loss": 1.28, "step": 932 }, { "epoch": 0.6142704304172496, "grad_norm": 0.4120824337005615, "learning_rate": 6.865087694099119e-05, "loss": 1.3627, "step": 933 }, { "epoch": 0.6149288124434203, "grad_norm": 0.4324477016925812, "learning_rate": 6.844773168854686e-05, "loss": 1.2828, "step": 934 }, { "epoch": 0.615587194469591, "grad_norm": 0.4589644968509674, "learning_rate": 6.824473093949893e-05, "loss": 1.2969, "step": 935 }, { "epoch": 0.6162455764957616, "grad_norm": 0.47269025444984436, "learning_rate": 6.804187562355231e-05, "loss": 1.3824, "step": 936 }, { "epoch": 0.6169039585219324, "grad_norm": 0.48048487305641174, "learning_rate": 6.783916666974577e-05, "loss": 1.1838, "step": 937 }, { "epoch": 0.617562340548103, "grad_norm": 0.5445598363876343, "learning_rate": 6.763660500644783e-05, "loss": 1.2797, "step": 938 }, { "epoch": 0.6182207225742737, "grad_norm": 0.5924782752990723, "learning_rate": 6.74341915613525e-05, "loss": 1.3008, "step": 939 }, { "epoch": 0.6188791046004444, "grad_norm": 0.7298708558082581, "learning_rate": 6.723192726147483e-05, "loss": 1.3207, "step": 940 }, { "epoch": 0.6195374866266151, "grad_norm": 0.6842146515846252, "learning_rate": 6.702981303314694e-05, "loss": 1.034, "step": 941 }, { "epoch": 0.6201958686527858, "grad_norm": 0.8297483325004578, "learning_rate": 6.682784980201363e-05, "loss": 1.1227, "step": 942 }, { "epoch": 0.6208542506789565, "grad_norm": 0.8973710536956787, "learning_rate": 6.66260384930281e-05, "loss": 1.2266, "step": 943 }, { "epoch": 0.6215126327051271, "grad_norm": 1.0918468236923218, "learning_rate": 6.642438003044781e-05, "loss": 1.0894, "step": 944 }, { "epoch": 0.6221710147312979, "grad_norm": 1.386069655418396, "learning_rate": 6.622287533783031e-05, "loss": 1.0723, "step": 945 }, { "epoch": 0.6228293967574685, "grad_norm": 1.1602648496627808, "learning_rate": 6.60215253380287e-05, "loss": 0.9151, "step": 946 }, { "epoch": 0.6234877787836393, "grad_norm": 1.670488953590393, "learning_rate": 6.582033095318785e-05, "loss": 1.0987, "step": 947 }, { "epoch": 0.6241461608098099, "grad_norm": 1.558658242225647, "learning_rate": 6.561929310473971e-05, "loss": 1.0037, "step": 948 }, { "epoch": 0.6248045428359805, "grad_norm": 2.0037412643432617, "learning_rate": 6.541841271339954e-05, "loss": 1.3559, "step": 949 }, { "epoch": 0.6254629248621513, "grad_norm": 2.0093250274658203, "learning_rate": 6.521769069916135e-05, "loss": 0.9248, "step": 950 }, { "epoch": 0.6261213068883219, "grad_norm": 0.25129470229148865, "learning_rate": 6.501712798129386e-05, "loss": 1.1905, "step": 951 }, { "epoch": 0.6267796889144926, "grad_norm": 0.279048353433609, "learning_rate": 6.481672547833619e-05, "loss": 1.2209, "step": 952 }, { "epoch": 0.6274380709406633, "grad_norm": 0.28804081678390503, "learning_rate": 6.461648410809379e-05, "loss": 1.2236, "step": 953 }, { "epoch": 0.628096452966834, "grad_norm": 0.30973637104034424, "learning_rate": 6.441640478763405e-05, "loss": 1.1995, "step": 954 }, { "epoch": 0.6287548349930047, "grad_norm": 0.3179713189601898, "learning_rate": 6.421648843328229e-05, "loss": 1.2453, "step": 955 }, { "epoch": 0.6294132170191754, "grad_norm": 0.3436993360519409, "learning_rate": 6.401673596061747e-05, "loss": 1.2232, "step": 956 }, { "epoch": 0.630071599045346, "grad_norm": 0.3664078712463379, "learning_rate": 6.381714828446792e-05, "loss": 1.2657, "step": 957 }, { "epoch": 0.6307299810715168, "grad_norm": 0.4007686674594879, "learning_rate": 6.361772631890735e-05, "loss": 1.3071, "step": 958 }, { "epoch": 0.6313883630976874, "grad_norm": 0.45302218198776245, "learning_rate": 6.341847097725044e-05, "loss": 1.303, "step": 959 }, { "epoch": 0.6320467451238582, "grad_norm": 0.46928587555885315, "learning_rate": 6.321938317204885e-05, "loss": 1.3362, "step": 960 }, { "epoch": 0.6327051271500288, "grad_norm": 0.4900846481323242, "learning_rate": 6.302046381508686e-05, "loss": 1.3513, "step": 961 }, { "epoch": 0.6333635091761994, "grad_norm": 0.5181142687797546, "learning_rate": 6.282171381737741e-05, "loss": 1.3339, "step": 962 }, { "epoch": 0.6340218912023702, "grad_norm": 0.5704917311668396, "learning_rate": 6.262313408915772e-05, "loss": 1.3352, "step": 963 }, { "epoch": 0.6346802732285408, "grad_norm": 0.6403287053108215, "learning_rate": 6.242472553988521e-05, "loss": 1.3113, "step": 964 }, { "epoch": 0.6353386552547116, "grad_norm": 0.755416214466095, "learning_rate": 6.22264890782334e-05, "loss": 1.2442, "step": 965 }, { "epoch": 0.6359970372808822, "grad_norm": 0.6903100609779358, "learning_rate": 6.202842561208758e-05, "loss": 1.2622, "step": 966 }, { "epoch": 0.6366554193070529, "grad_norm": 0.8026793003082275, "learning_rate": 6.183053604854081e-05, "loss": 1.2804, "step": 967 }, { "epoch": 0.6373138013332236, "grad_norm": 0.9257503747940063, "learning_rate": 6.163282129388981e-05, "loss": 1.1607, "step": 968 }, { "epoch": 0.6379721833593943, "grad_norm": 1.1452815532684326, "learning_rate": 6.143528225363045e-05, "loss": 1.255, "step": 969 }, { "epoch": 0.638630565385565, "grad_norm": 1.2014694213867188, "learning_rate": 6.123791983245411e-05, "loss": 1.2015, "step": 970 }, { "epoch": 0.6392889474117357, "grad_norm": 1.2960498332977295, "learning_rate": 6.104073493424321e-05, "loss": 1.1026, "step": 971 }, { "epoch": 0.6399473294379063, "grad_norm": 1.4432493448257446, "learning_rate": 6.084372846206708e-05, "loss": 0.973, "step": 972 }, { "epoch": 0.6406057114640771, "grad_norm": 1.4832402467727661, "learning_rate": 6.0646901318177986e-05, "loss": 0.9008, "step": 973 }, { "epoch": 0.6412640934902477, "grad_norm": 1.7388958930969238, "learning_rate": 6.0450254404006845e-05, "loss": 0.8442, "step": 974 }, { "epoch": 0.6419224755164185, "grad_norm": 1.832165002822876, "learning_rate": 6.025378862015916e-05, "loss": 1.0636, "step": 975 }, { "epoch": 0.6425808575425891, "grad_norm": 0.2705865800380707, "learning_rate": 6.005750486641095e-05, "loss": 1.1149, "step": 976 }, { "epoch": 0.6432392395687597, "grad_norm": 0.29046082496643066, "learning_rate": 5.9861404041704514e-05, "loss": 1.1809, "step": 977 }, { "epoch": 0.6438976215949305, "grad_norm": 0.28934773802757263, "learning_rate": 5.9665487044144367e-05, "loss": 1.2279, "step": 978 }, { "epoch": 0.6445560036211011, "grad_norm": 0.31610801815986633, "learning_rate": 5.946975477099317e-05, "loss": 1.2745, "step": 979 }, { "epoch": 0.6452143856472718, "grad_norm": 0.3320939838886261, "learning_rate": 5.9274208118667565e-05, "loss": 1.2947, "step": 980 }, { "epoch": 0.6458727676734425, "grad_norm": 0.35664987564086914, "learning_rate": 5.9078847982734045e-05, "loss": 1.1646, "step": 981 }, { "epoch": 0.6465311496996132, "grad_norm": 0.36069172620773315, "learning_rate": 5.8883675257904936e-05, "loss": 1.2487, "step": 982 }, { "epoch": 0.6471895317257839, "grad_norm": 0.38387221097946167, "learning_rate": 5.868869083803431e-05, "loss": 1.222, "step": 983 }, { "epoch": 0.6478479137519546, "grad_norm": 0.4363352358341217, "learning_rate": 5.849389561611371e-05, "loss": 1.432, "step": 984 }, { "epoch": 0.6485062957781252, "grad_norm": 0.4462944269180298, "learning_rate": 5.829929048426828e-05, "loss": 1.1995, "step": 985 }, { "epoch": 0.649164677804296, "grad_norm": 0.44570255279541016, "learning_rate": 5.810487633375261e-05, "loss": 1.2647, "step": 986 }, { "epoch": 0.6498230598304666, "grad_norm": 0.4554101526737213, "learning_rate": 5.791065405494647e-05, "loss": 1.2046, "step": 987 }, { "epoch": 0.6504814418566374, "grad_norm": 0.5013126730918884, "learning_rate": 5.77166245373511e-05, "loss": 1.2915, "step": 988 }, { "epoch": 0.651139823882808, "grad_norm": 0.5789803266525269, "learning_rate": 5.752278866958487e-05, "loss": 1.3287, "step": 989 }, { "epoch": 0.6517982059089786, "grad_norm": 0.6442369222640991, "learning_rate": 5.732914733937916e-05, "loss": 1.2078, "step": 990 }, { "epoch": 0.6524565879351494, "grad_norm": 0.7384447455406189, "learning_rate": 5.713570143357452e-05, "loss": 1.2361, "step": 991 }, { "epoch": 0.65311496996132, "grad_norm": 0.8517095446586609, "learning_rate": 5.694245183811644e-05, "loss": 1.1178, "step": 992 }, { "epoch": 0.6537733519874908, "grad_norm": 1.0125716924667358, "learning_rate": 5.6749399438051396e-05, "loss": 1.2424, "step": 993 }, { "epoch": 0.6544317340136614, "grad_norm": 1.0817424058914185, "learning_rate": 5.655654511752274e-05, "loss": 1.2225, "step": 994 }, { "epoch": 0.6550901160398321, "grad_norm": 1.2550139427185059, "learning_rate": 5.636388975976652e-05, "loss": 1.0147, "step": 995 }, { "epoch": 0.6557484980660028, "grad_norm": 1.4273148775100708, "learning_rate": 5.6171434247107776e-05, "loss": 1.0333, "step": 996 }, { "epoch": 0.6564068800921735, "grad_norm": 1.553175687789917, "learning_rate": 5.5979179460956255e-05, "loss": 1.0193, "step": 997 }, { "epoch": 0.6570652621183442, "grad_norm": 1.6307944059371948, "learning_rate": 5.578712628180225e-05, "loss": 1.2256, "step": 998 }, { "epoch": 0.6577236441445149, "grad_norm": 1.5277482271194458, "learning_rate": 5.5595275589212846e-05, "loss": 1.0016, "step": 999 }, { "epoch": 0.6583820261706855, "grad_norm": 2.4595751762390137, "learning_rate": 5.540362826182791e-05, "loss": 1.2254, "step": 1000 }, { "epoch": 0.6590404081968563, "grad_norm": 0.25844258069992065, "learning_rate": 5.521218517735568e-05, "loss": 1.1153, "step": 1001 }, { "epoch": 0.6596987902230269, "grad_norm": 0.2815786600112915, "learning_rate": 5.502094721256916e-05, "loss": 1.2309, "step": 1002 }, { "epoch": 0.6603571722491975, "grad_norm": 0.280496746301651, "learning_rate": 5.482991524330192e-05, "loss": 1.1165, "step": 1003 }, { "epoch": 0.6610155542753683, "grad_norm": 0.3004969358444214, "learning_rate": 5.4639090144444084e-05, "loss": 1.1658, "step": 1004 }, { "epoch": 0.6616739363015389, "grad_norm": 0.3371525704860687, "learning_rate": 5.444847278993836e-05, "loss": 1.2179, "step": 1005 }, { "epoch": 0.6623323183277097, "grad_norm": 0.3354127109050751, "learning_rate": 5.425806405277609e-05, "loss": 1.218, "step": 1006 }, { "epoch": 0.6629907003538803, "grad_norm": 0.34831690788269043, "learning_rate": 5.406786480499304e-05, "loss": 1.1558, "step": 1007 }, { "epoch": 0.663649082380051, "grad_norm": 0.39226022362709045, "learning_rate": 5.387787591766562e-05, "loss": 1.3391, "step": 1008 }, { "epoch": 0.6643074644062217, "grad_norm": 0.41597703099250793, "learning_rate": 5.3688098260907005e-05, "loss": 1.1732, "step": 1009 }, { "epoch": 0.6649658464323924, "grad_norm": 0.43895408511161804, "learning_rate": 5.349853270386268e-05, "loss": 1.2245, "step": 1010 }, { "epoch": 0.6656242284585631, "grad_norm": 0.5116786956787109, "learning_rate": 5.3309180114706936e-05, "loss": 1.3439, "step": 1011 }, { "epoch": 0.6662826104847338, "grad_norm": 0.5254409909248352, "learning_rate": 5.312004136063866e-05, "loss": 1.2908, "step": 1012 }, { "epoch": 0.6669409925109044, "grad_norm": 0.6241359114646912, "learning_rate": 5.2931117307877454e-05, "loss": 1.2986, "step": 1013 }, { "epoch": 0.6675993745370752, "grad_norm": 0.6010854244232178, "learning_rate": 5.2742408821659574e-05, "loss": 1.2869, "step": 1014 }, { "epoch": 0.6682577565632458, "grad_norm": 0.6964030265808105, "learning_rate": 5.255391676623408e-05, "loss": 1.2385, "step": 1015 }, { "epoch": 0.6689161385894165, "grad_norm": 0.7789903879165649, "learning_rate": 5.23656420048587e-05, "loss": 1.3629, "step": 1016 }, { "epoch": 0.6695745206155872, "grad_norm": 0.82118159532547, "learning_rate": 5.2177585399796155e-05, "loss": 1.153, "step": 1017 }, { "epoch": 0.6702329026417578, "grad_norm": 1.0225419998168945, "learning_rate": 5.198974781231003e-05, "loss": 1.1674, "step": 1018 }, { "epoch": 0.6708912846679286, "grad_norm": 1.048144817352295, "learning_rate": 5.180213010266068e-05, "loss": 1.1287, "step": 1019 }, { "epoch": 0.6715496666940992, "grad_norm": 1.191335678100586, "learning_rate": 5.161473313010162e-05, "loss": 1.075, "step": 1020 }, { "epoch": 0.67220804872027, "grad_norm": 1.3267505168914795, "learning_rate": 5.142755775287549e-05, "loss": 1.2235, "step": 1021 }, { "epoch": 0.6728664307464406, "grad_norm": 1.4214344024658203, "learning_rate": 5.124060482820986e-05, "loss": 0.95, "step": 1022 }, { "epoch": 0.6735248127726113, "grad_norm": 1.4128987789154053, "learning_rate": 5.1053875212313654e-05, "loss": 0.9548, "step": 1023 }, { "epoch": 0.674183194798782, "grad_norm": 1.7373591661453247, "learning_rate": 5.086736976037304e-05, "loss": 1.2802, "step": 1024 }, { "epoch": 0.6748415768249527, "grad_norm": 2.1737067699432373, "learning_rate": 5.068108932654758e-05, "loss": 0.9537, "step": 1025 }, { "epoch": 0.6754999588511233, "grad_norm": 0.2549096941947937, "learning_rate": 5.049503476396627e-05, "loss": 1.1644, "step": 1026 }, { "epoch": 0.6761583408772941, "grad_norm": 0.2823541462421417, "learning_rate": 5.0309206924723716e-05, "loss": 1.2863, "step": 1027 }, { "epoch": 0.6768167229034647, "grad_norm": 0.2980341911315918, "learning_rate": 5.012360665987606e-05, "loss": 1.2424, "step": 1028 }, { "epoch": 0.6774751049296355, "grad_norm": 0.3262587785720825, "learning_rate": 4.993823481943726e-05, "loss": 1.2237, "step": 1029 }, { "epoch": 0.6781334869558061, "grad_norm": 0.33284589648246765, "learning_rate": 4.975309225237524e-05, "loss": 1.2683, "step": 1030 }, { "epoch": 0.6787918689819767, "grad_norm": 0.35492652654647827, "learning_rate": 4.9568179806607704e-05, "loss": 1.1633, "step": 1031 }, { "epoch": 0.6794502510081475, "grad_norm": 0.3714054226875305, "learning_rate": 4.9383498328998554e-05, "loss": 1.2235, "step": 1032 }, { "epoch": 0.6801086330343181, "grad_norm": 0.4089430570602417, "learning_rate": 4.91990486653539e-05, "loss": 1.2308, "step": 1033 }, { "epoch": 0.6807670150604889, "grad_norm": 0.4130793809890747, "learning_rate": 4.901483166041815e-05, "loss": 1.2542, "step": 1034 }, { "epoch": 0.6814253970866595, "grad_norm": 0.44798097014427185, "learning_rate": 4.88308481578702e-05, "loss": 1.2694, "step": 1035 }, { "epoch": 0.6820837791128302, "grad_norm": 0.4866581857204437, "learning_rate": 4.864709900031961e-05, "loss": 1.2465, "step": 1036 }, { "epoch": 0.6827421611390009, "grad_norm": 0.525903582572937, "learning_rate": 4.8463585029302495e-05, "loss": 1.2073, "step": 1037 }, { "epoch": 0.6834005431651716, "grad_norm": 0.5323742628097534, "learning_rate": 4.8280307085278144e-05, "loss": 1.2234, "step": 1038 }, { "epoch": 0.6840589251913423, "grad_norm": 0.6354827284812927, "learning_rate": 4.809726600762461e-05, "loss": 1.1542, "step": 1039 }, { "epoch": 0.684717307217513, "grad_norm": 0.7236683964729309, "learning_rate": 4.7914462634635315e-05, "loss": 1.2184, "step": 1040 }, { "epoch": 0.6853756892436836, "grad_norm": 0.8659993410110474, "learning_rate": 4.773189780351498e-05, "loss": 1.3878, "step": 1041 }, { "epoch": 0.6860340712698544, "grad_norm": 0.9045376181602478, "learning_rate": 4.754957235037586e-05, "loss": 1.2035, "step": 1042 }, { "epoch": 0.686692453296025, "grad_norm": 1.003937005996704, "learning_rate": 4.736748711023391e-05, "loss": 1.1478, "step": 1043 }, { "epoch": 0.6873508353221957, "grad_norm": 1.076656699180603, "learning_rate": 4.718564291700497e-05, "loss": 1.1958, "step": 1044 }, { "epoch": 0.6880092173483664, "grad_norm": 1.2444761991500854, "learning_rate": 4.700404060350082e-05, "loss": 1.2481, "step": 1045 }, { "epoch": 0.688667599374537, "grad_norm": 1.4935905933380127, "learning_rate": 4.682268100142566e-05, "loss": 1.2006, "step": 1046 }, { "epoch": 0.6893259814007078, "grad_norm": 1.621973991394043, "learning_rate": 4.664156494137203e-05, "loss": 1.119, "step": 1047 }, { "epoch": 0.6899843634268784, "grad_norm": 1.7240283489227295, "learning_rate": 4.6460693252817e-05, "loss": 1.3676, "step": 1048 }, { "epoch": 0.6906427454530492, "grad_norm": 2.443277359008789, "learning_rate": 4.628006676411861e-05, "loss": 0.7995, "step": 1049 }, { "epoch": 0.6913011274792198, "grad_norm": 3.4616079330444336, "learning_rate": 4.609968630251187e-05, "loss": 1.322, "step": 1050 }, { "epoch": 0.6919595095053905, "grad_norm": 0.26153790950775146, "learning_rate": 4.591955269410502e-05, "loss": 1.0635, "step": 1051 }, { "epoch": 0.6926178915315612, "grad_norm": 0.3033198118209839, "learning_rate": 4.573966676387579e-05, "loss": 1.2605, "step": 1052 }, { "epoch": 0.6932762735577319, "grad_norm": 0.31014177203178406, "learning_rate": 4.5560029335667566e-05, "loss": 1.2265, "step": 1053 }, { "epoch": 0.6939346555839025, "grad_norm": 0.3218647837638855, "learning_rate": 4.538064123218565e-05, "loss": 1.1889, "step": 1054 }, { "epoch": 0.6945930376100733, "grad_norm": 0.3382401764392853, "learning_rate": 4.520150327499351e-05, "loss": 1.2482, "step": 1055 }, { "epoch": 0.6952514196362439, "grad_norm": 0.35672977566719055, "learning_rate": 4.502261628450898e-05, "loss": 1.2232, "step": 1056 }, { "epoch": 0.6959098016624146, "grad_norm": 0.3911837935447693, "learning_rate": 4.484398108000043e-05, "loss": 1.2294, "step": 1057 }, { "epoch": 0.6965681836885853, "grad_norm": 0.3901582360267639, "learning_rate": 4.466559847958318e-05, "loss": 1.1817, "step": 1058 }, { "epoch": 0.6972265657147559, "grad_norm": 0.42149317264556885, "learning_rate": 4.448746930021577e-05, "loss": 1.2612, "step": 1059 }, { "epoch": 0.6978849477409267, "grad_norm": 0.45483702421188354, "learning_rate": 4.430959435769589e-05, "loss": 1.2763, "step": 1060 }, { "epoch": 0.6985433297670973, "grad_norm": 0.4777740240097046, "learning_rate": 4.4131974466657053e-05, "loss": 1.2458, "step": 1061 }, { "epoch": 0.6992017117932681, "grad_norm": 0.5076452493667603, "learning_rate": 4.395461044056461e-05, "loss": 1.299, "step": 1062 }, { "epoch": 0.6998600938194387, "grad_norm": 0.547406792640686, "learning_rate": 4.377750309171211e-05, "loss": 1.3432, "step": 1063 }, { "epoch": 0.7005184758456094, "grad_norm": 0.5847563743591309, "learning_rate": 4.3600653231217595e-05, "loss": 1.267, "step": 1064 }, { "epoch": 0.7011768578717801, "grad_norm": 0.649928629398346, "learning_rate": 4.3424061669019856e-05, "loss": 1.1567, "step": 1065 }, { "epoch": 0.7018352398979508, "grad_norm": 0.8520046472549438, "learning_rate": 4.32477292138746e-05, "loss": 1.2407, "step": 1066 }, { "epoch": 0.7024936219241215, "grad_norm": 0.7802565097808838, "learning_rate": 4.30716566733511e-05, "loss": 1.0781, "step": 1067 }, { "epoch": 0.7031520039502922, "grad_norm": 0.9612601399421692, "learning_rate": 4.289584485382816e-05, "loss": 1.0628, "step": 1068 }, { "epoch": 0.7038103859764628, "grad_norm": 1.143847107887268, "learning_rate": 4.2720294560490424e-05, "loss": 1.3889, "step": 1069 }, { "epoch": 0.7044687680026336, "grad_norm": 1.1900006532669067, "learning_rate": 4.2545006597324954e-05, "loss": 1.0227, "step": 1070 }, { "epoch": 0.7051271500288042, "grad_norm": 1.1799747943878174, "learning_rate": 4.236998176711732e-05, "loss": 1.0563, "step": 1071 }, { "epoch": 0.7057855320549749, "grad_norm": 1.4355189800262451, "learning_rate": 4.2195220871448e-05, "loss": 1.1305, "step": 1072 }, { "epoch": 0.7064439140811456, "grad_norm": 1.5447733402252197, "learning_rate": 4.202072471068873e-05, "loss": 1.1425, "step": 1073 }, { "epoch": 0.7071022961073162, "grad_norm": 1.3460900783538818, "learning_rate": 4.1846494083998757e-05, "loss": 0.7108, "step": 1074 }, { "epoch": 0.707760678133487, "grad_norm": 1.8062644004821777, "learning_rate": 4.167252978932127e-05, "loss": 0.9145, "step": 1075 }, { "epoch": 0.7084190601596576, "grad_norm": 0.2656218111515045, "learning_rate": 4.149883262337969e-05, "loss": 1.1331, "step": 1076 }, { "epoch": 0.7090774421858284, "grad_norm": 0.28789886832237244, "learning_rate": 4.132540338167411e-05, "loss": 1.2669, "step": 1077 }, { "epoch": 0.709735824211999, "grad_norm": 0.29201340675354004, "learning_rate": 4.115224285847743e-05, "loss": 1.2453, "step": 1078 }, { "epoch": 0.7103942062381697, "grad_norm": 0.31592023372650146, "learning_rate": 4.0979351846831945e-05, "loss": 1.2386, "step": 1079 }, { "epoch": 0.7110525882643404, "grad_norm": 0.31521445512771606, "learning_rate": 4.080673113854579e-05, "loss": 1.1515, "step": 1080 }, { "epoch": 0.7117109702905111, "grad_norm": 0.33650001883506775, "learning_rate": 4.06343815241889e-05, "loss": 1.2852, "step": 1081 }, { "epoch": 0.7123693523166817, "grad_norm": 0.38492700457572937, "learning_rate": 4.046230379308982e-05, "loss": 1.1759, "step": 1082 }, { "epoch": 0.7130277343428525, "grad_norm": 0.37704896926879883, "learning_rate": 4.029049873333187e-05, "loss": 1.2402, "step": 1083 }, { "epoch": 0.7136861163690231, "grad_norm": 0.414652019739151, "learning_rate": 4.0118967131749595e-05, "loss": 1.2345, "step": 1084 }, { "epoch": 0.7143444983951938, "grad_norm": 0.4506089687347412, "learning_rate": 3.9947709773925176e-05, "loss": 1.3202, "step": 1085 }, { "epoch": 0.7150028804213645, "grad_norm": 0.49017423391342163, "learning_rate": 3.9776727444184744e-05, "loss": 1.4229, "step": 1086 }, { "epoch": 0.7156612624475351, "grad_norm": 0.5030205845832825, "learning_rate": 3.9606020925594875e-05, "loss": 1.2759, "step": 1087 }, { "epoch": 0.7163196444737059, "grad_norm": 0.49796900153160095, "learning_rate": 3.943559099995911e-05, "loss": 1.2619, "step": 1088 }, { "epoch": 0.7169780264998765, "grad_norm": 0.5899502038955688, "learning_rate": 3.926543844781405e-05, "loss": 1.2904, "step": 1089 }, { "epoch": 0.7176364085260473, "grad_norm": 0.6521037817001343, "learning_rate": 3.9095564048426094e-05, "loss": 1.1059, "step": 1090 }, { "epoch": 0.7182947905522179, "grad_norm": 0.8144060969352722, "learning_rate": 3.892596857978772e-05, "loss": 1.2482, "step": 1091 }, { "epoch": 0.7189531725783886, "grad_norm": 0.9115782976150513, "learning_rate": 3.875665281861397e-05, "loss": 1.3264, "step": 1092 }, { "epoch": 0.7196115546045593, "grad_norm": 1.0456653833389282, "learning_rate": 3.858761754033888e-05, "loss": 1.0351, "step": 1093 }, { "epoch": 0.72026993663073, "grad_norm": 0.9794834852218628, "learning_rate": 3.8418863519111946e-05, "loss": 0.9958, "step": 1094 }, { "epoch": 0.7209283186569007, "grad_norm": 1.2594164609909058, "learning_rate": 3.825039152779444e-05, "loss": 1.0091, "step": 1095 }, { "epoch": 0.7215867006830714, "grad_norm": 1.4557605981826782, "learning_rate": 3.808220233795618e-05, "loss": 1.2996, "step": 1096 }, { "epoch": 0.722245082709242, "grad_norm": 1.5663079023361206, "learning_rate": 3.7914296719871726e-05, "loss": 1.0816, "step": 1097 }, { "epoch": 0.7229034647354127, "grad_norm": 1.6405770778656006, "learning_rate": 3.774667544251683e-05, "loss": 0.9198, "step": 1098 }, { "epoch": 0.7235618467615834, "grad_norm": 1.8190784454345703, "learning_rate": 3.7579339273565175e-05, "loss": 0.9685, "step": 1099 }, { "epoch": 0.724220228787754, "grad_norm": 2.4316556453704834, "learning_rate": 3.7412288979384605e-05, "loss": 1.3515, "step": 1100 }, { "epoch": 0.7248786108139248, "grad_norm": 0.2535538375377655, "learning_rate": 3.724552532503374e-05, "loss": 1.1048, "step": 1101 }, { "epoch": 0.7255369928400954, "grad_norm": 0.28741225600242615, "learning_rate": 3.7079049074258466e-05, "loss": 1.2503, "step": 1102 }, { "epoch": 0.7261953748662662, "grad_norm": 0.27948105335235596, "learning_rate": 3.691286098948837e-05, "loss": 1.1109, "step": 1103 }, { "epoch": 0.7268537568924368, "grad_norm": 0.3239017128944397, "learning_rate": 3.674696183183334e-05, "loss": 1.2204, "step": 1104 }, { "epoch": 0.7275121389186076, "grad_norm": 0.3249548077583313, "learning_rate": 3.658135236108e-05, "loss": 1.2741, "step": 1105 }, { "epoch": 0.7281705209447782, "grad_norm": 0.33894842863082886, "learning_rate": 3.641603333568831e-05, "loss": 1.1956, "step": 1106 }, { "epoch": 0.7288289029709489, "grad_norm": 0.3534873425960541, "learning_rate": 3.625100551278794e-05, "loss": 1.2714, "step": 1107 }, { "epoch": 0.7294872849971196, "grad_norm": 0.38609808683395386, "learning_rate": 3.6086269648174964e-05, "loss": 1.2652, "step": 1108 }, { "epoch": 0.7301456670232903, "grad_norm": 0.40789276361465454, "learning_rate": 3.5921826496308476e-05, "loss": 1.2157, "step": 1109 }, { "epoch": 0.730804049049461, "grad_norm": 0.47272974252700806, "learning_rate": 3.575767681030677e-05, "loss": 1.4159, "step": 1110 }, { "epoch": 0.7314624310756316, "grad_norm": 0.45874321460723877, "learning_rate": 3.559382134194427e-05, "loss": 1.2688, "step": 1111 }, { "epoch": 0.7321208131018023, "grad_norm": 0.4879624843597412, "learning_rate": 3.543026084164789e-05, "loss": 1.1683, "step": 1112 }, { "epoch": 0.732779195127973, "grad_norm": 0.5352937579154968, "learning_rate": 3.5266996058493665e-05, "loss": 1.3928, "step": 1113 }, { "epoch": 0.7334375771541437, "grad_norm": 0.5928652882575989, "learning_rate": 3.5104027740203304e-05, "loss": 1.129, "step": 1114 }, { "epoch": 0.7340959591803143, "grad_norm": 0.6211274862289429, "learning_rate": 3.494135663314077e-05, "loss": 1.2917, "step": 1115 }, { "epoch": 0.7347543412064851, "grad_norm": 0.7669411301612854, "learning_rate": 3.4778983482308746e-05, "loss": 1.1392, "step": 1116 }, { "epoch": 0.7354127232326557, "grad_norm": 0.9671489000320435, "learning_rate": 3.4616909031345533e-05, "loss": 1.3572, "step": 1117 }, { "epoch": 0.7360711052588265, "grad_norm": 1.1423603296279907, "learning_rate": 3.445513402252132e-05, "loss": 1.3504, "step": 1118 }, { "epoch": 0.7367294872849971, "grad_norm": 1.0919145345687866, "learning_rate": 3.429365919673486e-05, "loss": 1.2875, "step": 1119 }, { "epoch": 0.7373878693111678, "grad_norm": 1.0725616216659546, "learning_rate": 3.413248529351023e-05, "loss": 1.1314, "step": 1120 }, { "epoch": 0.7380462513373385, "grad_norm": 1.3317692279815674, "learning_rate": 3.39716130509933e-05, "loss": 0.8813, "step": 1121 }, { "epoch": 0.7387046333635092, "grad_norm": 1.4635097980499268, "learning_rate": 3.3811043205948366e-05, "loss": 1.0086, "step": 1122 }, { "epoch": 0.7393630153896799, "grad_norm": 1.4112662076950073, "learning_rate": 3.365077649375483e-05, "loss": 0.8766, "step": 1123 }, { "epoch": 0.7400213974158506, "grad_norm": 1.492060899734497, "learning_rate": 3.3490813648403806e-05, "loss": 0.6588, "step": 1124 }, { "epoch": 0.7406797794420212, "grad_norm": 2.1554183959960938, "learning_rate": 3.333115540249475e-05, "loss": 1.2415, "step": 1125 }, { "epoch": 0.7413381614681919, "grad_norm": 0.2539469599723816, "learning_rate": 3.3171802487232086e-05, "loss": 1.194, "step": 1126 }, { "epoch": 0.7419965434943626, "grad_norm": 0.29159027338027954, "learning_rate": 3.3012755632421944e-05, "loss": 1.141, "step": 1127 }, { "epoch": 0.7426549255205332, "grad_norm": 0.30518674850463867, "learning_rate": 3.285401556646864e-05, "loss": 1.2569, "step": 1128 }, { "epoch": 0.743313307546704, "grad_norm": 0.31295838952064514, "learning_rate": 3.269558301637158e-05, "loss": 1.1932, "step": 1129 }, { "epoch": 0.7439716895728746, "grad_norm": 0.3336142599582672, "learning_rate": 3.253745870772173e-05, "loss": 1.2418, "step": 1130 }, { "epoch": 0.7446300715990454, "grad_norm": 0.3475276231765747, "learning_rate": 3.2379643364698406e-05, "loss": 1.2618, "step": 1131 }, { "epoch": 0.745288453625216, "grad_norm": 0.36097291111946106, "learning_rate": 3.2222137710065916e-05, "loss": 1.2751, "step": 1132 }, { "epoch": 0.7459468356513868, "grad_norm": 0.39613986015319824, "learning_rate": 3.206494246517022e-05, "loss": 1.2272, "step": 1133 }, { "epoch": 0.7466052176775574, "grad_norm": 0.38778916001319885, "learning_rate": 3.1908058349935696e-05, "loss": 1.1203, "step": 1134 }, { "epoch": 0.7472635997037281, "grad_norm": 0.4759044647216797, "learning_rate": 3.1751486082861836e-05, "loss": 1.3597, "step": 1135 }, { "epoch": 0.7479219817298988, "grad_norm": 0.4659407436847687, "learning_rate": 3.159522638101982e-05, "loss": 1.3581, "step": 1136 }, { "epoch": 0.7485803637560695, "grad_norm": 0.5113791227340698, "learning_rate": 3.143927996004938e-05, "loss": 1.4088, "step": 1137 }, { "epoch": 0.7492387457822401, "grad_norm": 0.5213654637336731, "learning_rate": 3.128364753415565e-05, "loss": 1.2005, "step": 1138 }, { "epoch": 0.7498971278084108, "grad_norm": 0.6035731434822083, "learning_rate": 3.112832981610545e-05, "loss": 1.2328, "step": 1139 }, { "epoch": 0.7505555098345815, "grad_norm": 0.8355278968811035, "learning_rate": 3.097332751722447e-05, "loss": 1.2166, "step": 1140 }, { "epoch": 0.7505555098345815, "eval_loss": 1.1745542287826538, "eval_runtime": 142.6978, "eval_samples_per_second": 17.926, "eval_steps_per_second": 4.485, "step": 1140 }, { "epoch": 0.7512138918607522, "grad_norm": 0.7555129528045654, "learning_rate": 3.081864134739381e-05, "loss": 1.1989, "step": 1141 }, { "epoch": 0.7518722738869229, "grad_norm": 0.8739767670631409, "learning_rate": 3.0664272015046734e-05, "loss": 1.2435, "step": 1142 }, { "epoch": 0.7525306559130935, "grad_norm": 0.9945182204246521, "learning_rate": 3.0510220227165474e-05, "loss": 1.1476, "step": 1143 }, { "epoch": 0.7531890379392643, "grad_norm": 0.9394963383674622, "learning_rate": 3.0356486689277995e-05, "loss": 0.8853, "step": 1144 }, { "epoch": 0.7538474199654349, "grad_norm": 1.3489819765090942, "learning_rate": 3.0203072105454622e-05, "loss": 1.1426, "step": 1145 }, { "epoch": 0.7545058019916057, "grad_norm": 1.6051266193389893, "learning_rate": 3.0049977178305076e-05, "loss": 1.5071, "step": 1146 }, { "epoch": 0.7551641840177763, "grad_norm": 1.4647694826126099, "learning_rate": 2.9897202608975074e-05, "loss": 1.1429, "step": 1147 }, { "epoch": 0.755822566043947, "grad_norm": 1.5680222511291504, "learning_rate": 2.974474909714304e-05, "loss": 1.1338, "step": 1148 }, { "epoch": 0.7564809480701177, "grad_norm": 1.7426420450210571, "learning_rate": 2.959261734101716e-05, "loss": 0.9512, "step": 1149 }, { "epoch": 0.7571393300962884, "grad_norm": 2.067566156387329, "learning_rate": 2.9440808037331967e-05, "loss": 1.0109, "step": 1150 }, { "epoch": 0.757797712122459, "grad_norm": 0.2545592188835144, "learning_rate": 2.9289321881345254e-05, "loss": 1.0832, "step": 1151 }, { "epoch": 0.7584560941486297, "grad_norm": 0.27604377269744873, "learning_rate": 2.9138159566834834e-05, "loss": 1.2362, "step": 1152 }, { "epoch": 0.7591144761748004, "grad_norm": 0.2910035252571106, "learning_rate": 2.8987321786095412e-05, "loss": 1.2334, "step": 1153 }, { "epoch": 0.7597728582009711, "grad_norm": 0.29931461811065674, "learning_rate": 2.8836809229935357e-05, "loss": 1.1605, "step": 1154 }, { "epoch": 0.7604312402271418, "grad_norm": 0.32173511385917664, "learning_rate": 2.8686622587673594e-05, "loss": 1.2719, "step": 1155 }, { "epoch": 0.7610896222533124, "grad_norm": 0.3663305342197418, "learning_rate": 2.8536762547136464e-05, "loss": 1.2495, "step": 1156 }, { "epoch": 0.7617480042794832, "grad_norm": 0.3718562126159668, "learning_rate": 2.8387229794654423e-05, "loss": 1.2236, "step": 1157 }, { "epoch": 0.7624063863056538, "grad_norm": 0.39510485529899597, "learning_rate": 2.823802501505909e-05, "loss": 1.2276, "step": 1158 }, { "epoch": 0.7630647683318246, "grad_norm": 0.4048973321914673, "learning_rate": 2.8089148891680084e-05, "loss": 1.2046, "step": 1159 }, { "epoch": 0.7637231503579952, "grad_norm": 0.42129042744636536, "learning_rate": 2.7940602106341707e-05, "loss": 1.3255, "step": 1160 }, { "epoch": 0.764381532384166, "grad_norm": 0.5140630602836609, "learning_rate": 2.7792385339360038e-05, "loss": 1.3431, "step": 1161 }, { "epoch": 0.7650399144103366, "grad_norm": 0.5049022436141968, "learning_rate": 2.7644499269539724e-05, "loss": 1.2449, "step": 1162 }, { "epoch": 0.7656982964365073, "grad_norm": 0.5614619851112366, "learning_rate": 2.749694457417089e-05, "loss": 1.336, "step": 1163 }, { "epoch": 0.766356678462678, "grad_norm": 0.6224168539047241, "learning_rate": 2.7349721929026005e-05, "loss": 1.2038, "step": 1164 }, { "epoch": 0.7670150604888486, "grad_norm": 0.6956164240837097, "learning_rate": 2.720283200835686e-05, "loss": 1.3149, "step": 1165 }, { "epoch": 0.7676734425150193, "grad_norm": 0.7140865921974182, "learning_rate": 2.7056275484891304e-05, "loss": 1.2284, "step": 1166 }, { "epoch": 0.76833182454119, "grad_norm": 0.8704789876937866, "learning_rate": 2.691005302983045e-05, "loss": 1.1158, "step": 1167 }, { "epoch": 0.7689902065673607, "grad_norm": 1.1140766143798828, "learning_rate": 2.6764165312845403e-05, "loss": 1.1269, "step": 1168 }, { "epoch": 0.7696485885935314, "grad_norm": 1.0424233675003052, "learning_rate": 2.661861300207412e-05, "loss": 0.944, "step": 1169 }, { "epoch": 0.7703069706197021, "grad_norm": 1.2212202548980713, "learning_rate": 2.6473396764118575e-05, "loss": 1.1607, "step": 1170 }, { "epoch": 0.7709653526458727, "grad_norm": 1.0277799367904663, "learning_rate": 2.6328517264041563e-05, "loss": 0.7129, "step": 1171 }, { "epoch": 0.7716237346720435, "grad_norm": 1.411064624786377, "learning_rate": 2.618397516536367e-05, "loss": 1.1172, "step": 1172 }, { "epoch": 0.7722821166982141, "grad_norm": 1.4100395441055298, "learning_rate": 2.6039771130060254e-05, "loss": 0.8726, "step": 1173 }, { "epoch": 0.7729404987243849, "grad_norm": 1.6474452018737793, "learning_rate": 2.589590581855843e-05, "loss": 0.8718, "step": 1174 }, { "epoch": 0.7735988807505555, "grad_norm": 1.7511224746704102, "learning_rate": 2.575237988973398e-05, "loss": 0.7599, "step": 1175 }, { "epoch": 0.7742572627767262, "grad_norm": 0.2579965889453888, "learning_rate": 2.560919400090843e-05, "loss": 1.2273, "step": 1176 }, { "epoch": 0.7749156448028969, "grad_norm": 0.2835389971733093, "learning_rate": 2.5466348807845908e-05, "loss": 1.2614, "step": 1177 }, { "epoch": 0.7755740268290676, "grad_norm": 0.29296889901161194, "learning_rate": 2.53238449647503e-05, "loss": 1.2371, "step": 1178 }, { "epoch": 0.7762324088552383, "grad_norm": 0.32429298758506775, "learning_rate": 2.5181683124262136e-05, "loss": 1.1937, "step": 1179 }, { "epoch": 0.7768907908814089, "grad_norm": 0.3148968815803528, "learning_rate": 2.503986393745564e-05, "loss": 1.1598, "step": 1180 }, { "epoch": 0.7775491729075796, "grad_norm": 0.3182394206523895, "learning_rate": 2.4898388053835763e-05, "loss": 1.1057, "step": 1181 }, { "epoch": 0.7782075549337503, "grad_norm": 0.34069356322288513, "learning_rate": 2.4757256121335183e-05, "loss": 1.2295, "step": 1182 }, { "epoch": 0.778865936959921, "grad_norm": 0.3777032494544983, "learning_rate": 2.461646878631133e-05, "loss": 1.404, "step": 1183 }, { "epoch": 0.7795243189860916, "grad_norm": 0.3982040584087372, "learning_rate": 2.4476026693543484e-05, "loss": 1.2168, "step": 1184 }, { "epoch": 0.7801827010122624, "grad_norm": 0.4333396553993225, "learning_rate": 2.4335930486229774e-05, "loss": 1.2611, "step": 1185 }, { "epoch": 0.780841083038433, "grad_norm": 0.46400609612464905, "learning_rate": 2.419618080598417e-05, "loss": 1.3248, "step": 1186 }, { "epoch": 0.7814994650646038, "grad_norm": 0.49693915247917175, "learning_rate": 2.405677829283366e-05, "loss": 1.3477, "step": 1187 }, { "epoch": 0.7821578470907744, "grad_norm": 0.5477811694145203, "learning_rate": 2.3917723585215357e-05, "loss": 1.2802, "step": 1188 }, { "epoch": 0.7828162291169452, "grad_norm": 0.5534879565238953, "learning_rate": 2.377901731997335e-05, "loss": 1.2442, "step": 1189 }, { "epoch": 0.7834746111431158, "grad_norm": 0.6362003087997437, "learning_rate": 2.3640660132355996e-05, "loss": 1.2242, "step": 1190 }, { "epoch": 0.7841329931692865, "grad_norm": 0.726051390171051, "learning_rate": 2.3502652656012947e-05, "loss": 1.2611, "step": 1191 }, { "epoch": 0.7847913751954572, "grad_norm": 0.8183614015579224, "learning_rate": 2.3364995522992227e-05, "loss": 1.1556, "step": 1192 }, { "epoch": 0.7854497572216278, "grad_norm": 0.8726836442947388, "learning_rate": 2.3227689363737382e-05, "loss": 1.2289, "step": 1193 }, { "epoch": 0.7861081392477985, "grad_norm": 1.0491002798080444, "learning_rate": 2.3090734807084548e-05, "loss": 1.3026, "step": 1194 }, { "epoch": 0.7867665212739692, "grad_norm": 1.1141643524169922, "learning_rate": 2.2954132480259504e-05, "loss": 0.9071, "step": 1195 }, { "epoch": 0.7874249033001399, "grad_norm": 1.2619067430496216, "learning_rate": 2.2817883008875063e-05, "loss": 1.0773, "step": 1196 }, { "epoch": 0.7880832853263106, "grad_norm": 1.916133165359497, "learning_rate": 2.2681987016927908e-05, "loss": 1.3466, "step": 1197 }, { "epoch": 0.7887416673524813, "grad_norm": 1.8914225101470947, "learning_rate": 2.2546445126795822e-05, "loss": 1.0075, "step": 1198 }, { "epoch": 0.7894000493786519, "grad_norm": 1.9101247787475586, "learning_rate": 2.2411257959234966e-05, "loss": 1.0738, "step": 1199 }, { "epoch": 0.7900584314048227, "grad_norm": 2.5399341583251953, "learning_rate": 2.227642613337686e-05, "loss": 1.1081, "step": 1200 }, { "epoch": 0.7907168134309933, "grad_norm": 0.2590787708759308, "learning_rate": 2.2141950266725687e-05, "loss": 1.1662, "step": 1201 }, { "epoch": 0.7913751954571641, "grad_norm": 0.31000667810440063, "learning_rate": 2.2007830975155363e-05, "loss": 1.2154, "step": 1202 }, { "epoch": 0.7920335774833347, "grad_norm": 0.28687340021133423, "learning_rate": 2.1874068872906772e-05, "loss": 1.1468, "step": 1203 }, { "epoch": 0.7926919595095054, "grad_norm": 0.3054833710193634, "learning_rate": 2.1740664572584946e-05, "loss": 1.1847, "step": 1204 }, { "epoch": 0.7933503415356761, "grad_norm": 0.32457491755485535, "learning_rate": 2.160761868515623e-05, "loss": 1.2377, "step": 1205 }, { "epoch": 0.7940087235618467, "grad_norm": 0.3366299867630005, "learning_rate": 2.1474931819945553e-05, "loss": 1.284, "step": 1206 }, { "epoch": 0.7946671055880175, "grad_norm": 0.3629007339477539, "learning_rate": 2.1342604584633506e-05, "loss": 1.2582, "step": 1207 }, { "epoch": 0.7953254876141881, "grad_norm": 0.3729316294193268, "learning_rate": 2.1210637585253703e-05, "loss": 1.2428, "step": 1208 }, { "epoch": 0.7959838696403588, "grad_norm": 0.390794575214386, "learning_rate": 2.107903142619e-05, "loss": 1.2041, "step": 1209 }, { "epoch": 0.7966422516665295, "grad_norm": 0.40834668278694153, "learning_rate": 2.0947786710173546e-05, "loss": 1.2392, "step": 1210 }, { "epoch": 0.7973006336927002, "grad_norm": 0.43642354011535645, "learning_rate": 2.081690403828024e-05, "loss": 1.2946, "step": 1211 }, { "epoch": 0.7979590157188708, "grad_norm": 0.46954917907714844, "learning_rate": 2.0686384009927838e-05, "loss": 1.3417, "step": 1212 }, { "epoch": 0.7986173977450416, "grad_norm": 0.5424888134002686, "learning_rate": 2.0556227222873313e-05, "loss": 1.4782, "step": 1213 }, { "epoch": 0.7992757797712122, "grad_norm": 0.5052427053451538, "learning_rate": 2.0426434273210014e-05, "loss": 1.18, "step": 1214 }, { "epoch": 0.799934161797383, "grad_norm": 0.5371694564819336, "learning_rate": 2.029700575536504e-05, "loss": 1.2642, "step": 1215 }, { "epoch": 0.8005925438235536, "grad_norm": 0.6594541668891907, "learning_rate": 2.0167942262096316e-05, "loss": 1.3036, "step": 1216 }, { "epoch": 0.8012509258497243, "grad_norm": 0.7874505519866943, "learning_rate": 2.003924438449023e-05, "loss": 1.1885, "step": 1217 }, { "epoch": 0.801909307875895, "grad_norm": 0.955467164516449, "learning_rate": 1.9910912711958618e-05, "loss": 1.2362, "step": 1218 }, { "epoch": 0.8025676899020657, "grad_norm": 0.9550561904907227, "learning_rate": 1.9782947832236097e-05, "loss": 1.037, "step": 1219 }, { "epoch": 0.8032260719282364, "grad_norm": 1.281804084777832, "learning_rate": 1.9655350331377566e-05, "loss": 1.0645, "step": 1220 }, { "epoch": 0.803884453954407, "grad_norm": 1.2864665985107422, "learning_rate": 1.9528120793755322e-05, "loss": 1.0086, "step": 1221 }, { "epoch": 0.8045428359805777, "grad_norm": 1.5257987976074219, "learning_rate": 1.9401259802056492e-05, "loss": 1.1066, "step": 1222 }, { "epoch": 0.8052012180067484, "grad_norm": 1.65569007396698, "learning_rate": 1.9274767937280357e-05, "loss": 1.2644, "step": 1223 }, { "epoch": 0.8058596000329191, "grad_norm": 1.5913068056106567, "learning_rate": 1.9148645778735554e-05, "loss": 0.7979, "step": 1224 }, { "epoch": 0.8065179820590898, "grad_norm": 2.411531448364258, "learning_rate": 1.9022893904037676e-05, "loss": 1.2982, "step": 1225 }, { "epoch": 0.8071763640852605, "grad_norm": 0.24755673110485077, "learning_rate": 1.889751288910645e-05, "loss": 1.1338, "step": 1226 }, { "epoch": 0.8078347461114311, "grad_norm": 0.2823069989681244, "learning_rate": 1.8772503308163046e-05, "loss": 1.2658, "step": 1227 }, { "epoch": 0.8084931281376019, "grad_norm": 0.28120023012161255, "learning_rate": 1.8647865733727643e-05, "loss": 1.1407, "step": 1228 }, { "epoch": 0.8091515101637725, "grad_norm": 0.321236789226532, "learning_rate": 1.8523600736616632e-05, "loss": 1.2701, "step": 1229 }, { "epoch": 0.8098098921899433, "grad_norm": 0.33734390139579773, "learning_rate": 1.8399708885940136e-05, "loss": 1.1863, "step": 1230 }, { "epoch": 0.8104682742161139, "grad_norm": 0.33143654465675354, "learning_rate": 1.8276190749099264e-05, "loss": 1.315, "step": 1231 }, { "epoch": 0.8111266562422846, "grad_norm": 0.3370499610900879, "learning_rate": 1.8153046891783652e-05, "loss": 1.204, "step": 1232 }, { "epoch": 0.8117850382684553, "grad_norm": 0.37255147099494934, "learning_rate": 1.8030277877968772e-05, "loss": 1.2116, "step": 1233 }, { "epoch": 0.8124434202946259, "grad_norm": 0.39773163199424744, "learning_rate": 1.790788426991339e-05, "loss": 1.3115, "step": 1234 }, { "epoch": 0.8131018023207967, "grad_norm": 0.4204883873462677, "learning_rate": 1.778586662815701e-05, "loss": 1.3576, "step": 1235 }, { "epoch": 0.8137601843469673, "grad_norm": 0.48859432339668274, "learning_rate": 1.7664225511517198e-05, "loss": 1.2549, "step": 1236 }, { "epoch": 0.814418566373138, "grad_norm": 0.4854777157306671, "learning_rate": 1.7542961477087172e-05, "loss": 1.3323, "step": 1237 }, { "epoch": 0.8150769483993087, "grad_norm": 0.5650511980056763, "learning_rate": 1.742207508023327e-05, "loss": 1.2028, "step": 1238 }, { "epoch": 0.8157353304254794, "grad_norm": 0.6029946208000183, "learning_rate": 1.7301566874592134e-05, "loss": 1.36, "step": 1239 }, { "epoch": 0.81639371245165, "grad_norm": 0.6795003414154053, "learning_rate": 1.718143741206849e-05, "loss": 1.3337, "step": 1240 }, { "epoch": 0.8170520944778208, "grad_norm": 0.7911986112594604, "learning_rate": 1.7061687242832468e-05, "loss": 1.3073, "step": 1241 }, { "epoch": 0.8177104765039914, "grad_norm": 0.9688619375228882, "learning_rate": 1.694231691531709e-05, "loss": 1.3579, "step": 1242 }, { "epoch": 0.8183688585301622, "grad_norm": 0.9323916435241699, "learning_rate": 1.6823326976215792e-05, "loss": 1.1159, "step": 1243 }, { "epoch": 0.8190272405563328, "grad_norm": 1.2399168014526367, "learning_rate": 1.670471797047992e-05, "loss": 1.2605, "step": 1244 }, { "epoch": 0.8196856225825035, "grad_norm": 1.2477198839187622, "learning_rate": 1.6586490441316116e-05, "loss": 1.3426, "step": 1245 }, { "epoch": 0.8203440046086742, "grad_norm": 1.4674113988876343, "learning_rate": 1.6468644930184095e-05, "loss": 1.0689, "step": 1246 }, { "epoch": 0.8210023866348448, "grad_norm": 1.4392623901367188, "learning_rate": 1.6351181976793894e-05, "loss": 1.0795, "step": 1247 }, { "epoch": 0.8216607686610156, "grad_norm": 1.4645551443099976, "learning_rate": 1.62341021191035e-05, "loss": 1.2043, "step": 1248 }, { "epoch": 0.8223191506871862, "grad_norm": 1.7416354417800903, "learning_rate": 1.611740589331645e-05, "loss": 0.9595, "step": 1249 }, { "epoch": 0.8229775327133569, "grad_norm": 1.685441255569458, "learning_rate": 1.6001093833879287e-05, "loss": 0.7698, "step": 1250 }, { "epoch": 0.8236359147395276, "grad_norm": 0.25341877341270447, "learning_rate": 1.588516647347916e-05, "loss": 1.2026, "step": 1251 }, { "epoch": 0.8242942967656983, "grad_norm": 0.27512264251708984, "learning_rate": 1.5769624343041355e-05, "loss": 1.2053, "step": 1252 }, { "epoch": 0.824952678791869, "grad_norm": 0.28133538365364075, "learning_rate": 1.5654467971726894e-05, "loss": 1.1509, "step": 1253 }, { "epoch": 0.8256110608180397, "grad_norm": 0.3063061535358429, "learning_rate": 1.553969788693008e-05, "loss": 1.1995, "step": 1254 }, { "epoch": 0.8262694428442103, "grad_norm": 0.3312765955924988, "learning_rate": 1.542531461427611e-05, "loss": 1.2657, "step": 1255 }, { "epoch": 0.8269278248703811, "grad_norm": 0.3586176335811615, "learning_rate": 1.531131867761866e-05, "loss": 1.1941, "step": 1256 }, { "epoch": 0.8275862068965517, "grad_norm": 0.36552894115448, "learning_rate": 1.5197710599037429e-05, "loss": 1.198, "step": 1257 }, { "epoch": 0.8282445889227225, "grad_norm": 0.384895920753479, "learning_rate": 1.5084490898835856e-05, "loss": 1.2629, "step": 1258 }, { "epoch": 0.8289029709488931, "grad_norm": 0.3710484802722931, "learning_rate": 1.497166009553871e-05, "loss": 1.2126, "step": 1259 }, { "epoch": 0.8295613529750637, "grad_norm": 0.4181410074234009, "learning_rate": 1.4859218705889588e-05, "loss": 1.2037, "step": 1260 }, { "epoch": 0.8302197350012345, "grad_norm": 0.4357868731021881, "learning_rate": 1.4747167244848714e-05, "loss": 1.185, "step": 1261 }, { "epoch": 0.8308781170274051, "grad_norm": 0.489761620759964, "learning_rate": 1.4635506225590512e-05, "loss": 1.3106, "step": 1262 }, { "epoch": 0.8315364990535759, "grad_norm": 0.5292978882789612, "learning_rate": 1.4524236159501226e-05, "loss": 1.3421, "step": 1263 }, { "epoch": 0.8321948810797465, "grad_norm": 0.5524282455444336, "learning_rate": 1.4413357556176631e-05, "loss": 1.2944, "step": 1264 }, { "epoch": 0.8328532631059172, "grad_norm": 0.6491603851318359, "learning_rate": 1.430287092341971e-05, "loss": 1.3944, "step": 1265 }, { "epoch": 0.8335116451320879, "grad_norm": 0.6646156311035156, "learning_rate": 1.4192776767238158e-05, "loss": 1.2302, "step": 1266 }, { "epoch": 0.8341700271582586, "grad_norm": 0.8558313846588135, "learning_rate": 1.408307559184242e-05, "loss": 1.252, "step": 1267 }, { "epoch": 0.8348284091844292, "grad_norm": 0.9991942644119263, "learning_rate": 1.3973767899642975e-05, "loss": 1.1287, "step": 1268 }, { "epoch": 0.8354867912106, "grad_norm": 1.0037152767181396, "learning_rate": 1.38648541912483e-05, "loss": 1.2154, "step": 1269 }, { "epoch": 0.8361451732367706, "grad_norm": 1.1659568548202515, "learning_rate": 1.3756334965462502e-05, "loss": 1.0629, "step": 1270 }, { "epoch": 0.8368035552629414, "grad_norm": 1.38374662399292, "learning_rate": 1.3648210719283028e-05, "loss": 1.0876, "step": 1271 }, { "epoch": 0.837461937289112, "grad_norm": 1.2293808460235596, "learning_rate": 1.3540481947898376e-05, "loss": 0.8495, "step": 1272 }, { "epoch": 0.8381203193152827, "grad_norm": 1.2876181602478027, "learning_rate": 1.3433149144685908e-05, "loss": 0.8809, "step": 1273 }, { "epoch": 0.8387787013414534, "grad_norm": 1.7796697616577148, "learning_rate": 1.3326212801209392e-05, "loss": 1.0504, "step": 1274 }, { "epoch": 0.839437083367624, "grad_norm": 2.186758041381836, "learning_rate": 1.3219673407217037e-05, "loss": 1.0458, "step": 1275 }, { "epoch": 0.8400954653937948, "grad_norm": 0.2571915090084076, "learning_rate": 1.311353145063905e-05, "loss": 1.1547, "step": 1276 }, { "epoch": 0.8407538474199654, "grad_norm": 0.2778959274291992, "learning_rate": 1.3007787417585372e-05, "loss": 1.1754, "step": 1277 }, { "epoch": 0.8414122294461361, "grad_norm": 0.3004729747772217, "learning_rate": 1.2902441792343612e-05, "loss": 1.2591, "step": 1278 }, { "epoch": 0.8420706114723068, "grad_norm": 0.3155894875526428, "learning_rate": 1.279749505737674e-05, "loss": 1.2791, "step": 1279 }, { "epoch": 0.8427289934984775, "grad_norm": 0.33009597659111023, "learning_rate": 1.2692947693320867e-05, "loss": 1.1802, "step": 1280 }, { "epoch": 0.8433873755246482, "grad_norm": 0.3477858603000641, "learning_rate": 1.2588800178983051e-05, "loss": 1.2065, "step": 1281 }, { "epoch": 0.8440457575508189, "grad_norm": 0.355474591255188, "learning_rate": 1.2485052991339174e-05, "loss": 1.2134, "step": 1282 }, { "epoch": 0.8447041395769895, "grad_norm": 0.3893088400363922, "learning_rate": 1.2381706605531562e-05, "loss": 1.2709, "step": 1283 }, { "epoch": 0.8453625216031603, "grad_norm": 0.39613381028175354, "learning_rate": 1.2278761494867119e-05, "loss": 1.239, "step": 1284 }, { "epoch": 0.8460209036293309, "grad_norm": 0.42657214403152466, "learning_rate": 1.2176218130814887e-05, "loss": 1.2017, "step": 1285 }, { "epoch": 0.8466792856555017, "grad_norm": 0.5077014565467834, "learning_rate": 1.2074076983003958e-05, "loss": 1.2926, "step": 1286 }, { "epoch": 0.8473376676816723, "grad_norm": 0.4870278537273407, "learning_rate": 1.1972338519221393e-05, "loss": 1.2529, "step": 1287 }, { "epoch": 0.8479960497078429, "grad_norm": 0.5146488547325134, "learning_rate": 1.1871003205410091e-05, "loss": 1.3742, "step": 1288 }, { "epoch": 0.8486544317340137, "grad_norm": 0.6352048516273499, "learning_rate": 1.1770071505666491e-05, "loss": 1.2944, "step": 1289 }, { "epoch": 0.8493128137601843, "grad_norm": 0.6424921154975891, "learning_rate": 1.166954388223862e-05, "loss": 1.232, "step": 1290 }, { "epoch": 0.849971195786355, "grad_norm": 0.745509922504425, "learning_rate": 1.1569420795523911e-05, "loss": 1.46, "step": 1291 }, { "epoch": 0.8506295778125257, "grad_norm": 0.8032955527305603, "learning_rate": 1.1469702704067064e-05, "loss": 1.1648, "step": 1292 }, { "epoch": 0.8512879598386964, "grad_norm": 0.9472469091415405, "learning_rate": 1.1370390064558034e-05, "loss": 1.2158, "step": 1293 }, { "epoch": 0.8519463418648671, "grad_norm": 1.1131627559661865, "learning_rate": 1.1271483331829835e-05, "loss": 1.1096, "step": 1294 }, { "epoch": 0.8526047238910378, "grad_norm": 1.2295578718185425, "learning_rate": 1.1172982958856482e-05, "loss": 1.0952, "step": 1295 }, { "epoch": 0.8532631059172084, "grad_norm": 1.3151978254318237, "learning_rate": 1.107488939675102e-05, "loss": 0.9785, "step": 1296 }, { "epoch": 0.8539214879433792, "grad_norm": 1.631568431854248, "learning_rate": 1.0977203094763345e-05, "loss": 1.1322, "step": 1297 }, { "epoch": 0.8545798699695498, "grad_norm": 1.512904405593872, "learning_rate": 1.0879924500278116e-05, "loss": 1.1041, "step": 1298 }, { "epoch": 0.8552382519957206, "grad_norm": 1.573190689086914, "learning_rate": 1.0783054058812858e-05, "loss": 0.8695, "step": 1299 }, { "epoch": 0.8558966340218912, "grad_norm": 1.600016474723816, "learning_rate": 1.0686592214015768e-05, "loss": 0.7234, "step": 1300 }, { "epoch": 0.8565550160480618, "grad_norm": 0.2644733488559723, "learning_rate": 1.0590539407663803e-05, "loss": 1.1294, "step": 1301 }, { "epoch": 0.8572133980742326, "grad_norm": 0.28398260474205017, "learning_rate": 1.0494896079660554e-05, "loss": 1.2221, "step": 1302 }, { "epoch": 0.8578717801004032, "grad_norm": 0.3015180230140686, "learning_rate": 1.0399662668034327e-05, "loss": 1.1769, "step": 1303 }, { "epoch": 0.858530162126574, "grad_norm": 0.295482873916626, "learning_rate": 1.0304839608936e-05, "loss": 1.1557, "step": 1304 }, { "epoch": 0.8591885441527446, "grad_norm": 0.3402240574359894, "learning_rate": 1.021042733663723e-05, "loss": 1.1766, "step": 1305 }, { "epoch": 0.8598469261789153, "grad_norm": 0.35621342062950134, "learning_rate": 1.0116426283528302e-05, "loss": 1.268, "step": 1306 }, { "epoch": 0.860505308205086, "grad_norm": 0.36093905568122864, "learning_rate": 1.0022836880116138e-05, "loss": 1.2207, "step": 1307 }, { "epoch": 0.8611636902312567, "grad_norm": 0.4131215512752533, "learning_rate": 9.929659555022442e-06, "loss": 1.3246, "step": 1308 }, { "epoch": 0.8618220722574274, "grad_norm": 0.4348328113555908, "learning_rate": 9.836894734981706e-06, "loss": 1.2316, "step": 1309 }, { "epoch": 0.8624804542835981, "grad_norm": 0.471159964799881, "learning_rate": 9.744542844839143e-06, "loss": 1.4138, "step": 1310 }, { "epoch": 0.8631388363097687, "grad_norm": 0.4972412586212158, "learning_rate": 9.652604307548895e-06, "loss": 1.3278, "step": 1311 }, { "epoch": 0.8637972183359395, "grad_norm": 0.5030047297477722, "learning_rate": 9.561079544171992e-06, "loss": 1.2477, "step": 1312 }, { "epoch": 0.8644556003621101, "grad_norm": 0.5558236241340637, "learning_rate": 9.469968973874466e-06, "loss": 1.4028, "step": 1313 }, { "epoch": 0.8651139823882807, "grad_norm": 0.6310866475105286, "learning_rate": 9.37927301392545e-06, "loss": 1.2789, "step": 1314 }, { "epoch": 0.8657723644144515, "grad_norm": 0.7040503025054932, "learning_rate": 9.28899207969518e-06, "loss": 1.1996, "step": 1315 }, { "epoch": 0.8664307464406221, "grad_norm": 0.7893826365470886, "learning_rate": 9.199126584653183e-06, "loss": 1.324, "step": 1316 }, { "epoch": 0.8670891284667929, "grad_norm": 0.9313814043998718, "learning_rate": 9.109676940366418e-06, "loss": 1.2993, "step": 1317 }, { "epoch": 0.8677475104929635, "grad_norm": 0.8821551203727722, "learning_rate": 9.02064355649721e-06, "loss": 1.0871, "step": 1318 }, { "epoch": 0.8684058925191342, "grad_norm": 1.143466591835022, "learning_rate": 8.932026840801554e-06, "loss": 1.2784, "step": 1319 }, { "epoch": 0.8690642745453049, "grad_norm": 1.2319263219833374, "learning_rate": 8.843827199127207e-06, "loss": 0.9828, "step": 1320 }, { "epoch": 0.8697226565714756, "grad_norm": 1.3100347518920898, "learning_rate": 8.756045035411753e-06, "loss": 1.0976, "step": 1321 }, { "epoch": 0.8703810385976463, "grad_norm": 1.8412470817565918, "learning_rate": 8.668680751680835e-06, "loss": 1.3813, "step": 1322 }, { "epoch": 0.871039420623817, "grad_norm": 1.6632609367370605, "learning_rate": 8.5817347480463e-06, "loss": 0.9646, "step": 1323 }, { "epoch": 0.8716978026499876, "grad_norm": 1.9535080194473267, "learning_rate": 8.495207422704298e-06, "loss": 0.9062, "step": 1324 }, { "epoch": 0.8723561846761584, "grad_norm": 1.937402606010437, "learning_rate": 8.409099171933544e-06, "loss": 0.8929, "step": 1325 }, { "epoch": 0.873014566702329, "grad_norm": 0.26802825927734375, "learning_rate": 8.323410390093522e-06, "loss": 1.1504, "step": 1326 }, { "epoch": 0.8736729487284998, "grad_norm": 0.2851593494415283, "learning_rate": 8.238141469622529e-06, "loss": 1.1977, "step": 1327 }, { "epoch": 0.8743313307546704, "grad_norm": 0.29775407910346985, "learning_rate": 8.15329280103605e-06, "loss": 1.2695, "step": 1328 }, { "epoch": 0.874989712780841, "grad_norm": 0.29565665125846863, "learning_rate": 8.068864772924855e-06, "loss": 1.0892, "step": 1329 }, { "epoch": 0.8756480948070118, "grad_norm": 0.3173798620700836, "learning_rate": 7.984857771953303e-06, "loss": 1.2387, "step": 1330 }, { "epoch": 0.8763064768331824, "grad_norm": 0.34442922472953796, "learning_rate": 7.901272182857478e-06, "loss": 1.23, "step": 1331 }, { "epoch": 0.8769648588593532, "grad_norm": 0.35819026827812195, "learning_rate": 7.818108388443546e-06, "loss": 1.2773, "step": 1332 }, { "epoch": 0.8776232408855238, "grad_norm": 0.3835400938987732, "learning_rate": 7.735366769585817e-06, "loss": 1.2615, "step": 1333 }, { "epoch": 0.8782816229116945, "grad_norm": 0.4186893701553345, "learning_rate": 7.653047705225257e-06, "loss": 1.2031, "step": 1334 }, { "epoch": 0.8789400049378652, "grad_norm": 0.4734337627887726, "learning_rate": 7.571151572367541e-06, "loss": 1.34, "step": 1335 }, { "epoch": 0.8795983869640359, "grad_norm": 0.494617760181427, "learning_rate": 7.489678746081364e-06, "loss": 1.269, "step": 1336 }, { "epoch": 0.8802567689902066, "grad_norm": 0.540384829044342, "learning_rate": 7.408629599496808e-06, "loss": 1.3307, "step": 1337 }, { "epoch": 0.8809151510163773, "grad_norm": 0.5604397058486938, "learning_rate": 7.32800450380361e-06, "loss": 1.2498, "step": 1338 }, { "epoch": 0.8815735330425479, "grad_norm": 0.6044402122497559, "learning_rate": 7.247803828249356e-06, "loss": 1.2038, "step": 1339 }, { "epoch": 0.8822319150687187, "grad_norm": 0.6669920682907104, "learning_rate": 7.168027940137922e-06, "loss": 1.1242, "step": 1340 }, { "epoch": 0.8828902970948893, "grad_norm": 0.8415902853012085, "learning_rate": 7.088677204827721e-06, "loss": 1.3357, "step": 1341 }, { "epoch": 0.8835486791210599, "grad_norm": 0.9297363758087158, "learning_rate": 7.009751985730062e-06, "loss": 1.2527, "step": 1342 }, { "epoch": 0.8842070611472307, "grad_norm": 1.0265207290649414, "learning_rate": 6.93125264430744e-06, "loss": 1.0002, "step": 1343 }, { "epoch": 0.8848654431734013, "grad_norm": 1.0541255474090576, "learning_rate": 6.853179540071964e-06, "loss": 1.104, "step": 1344 }, { "epoch": 0.8855238251995721, "grad_norm": 1.1322312355041504, "learning_rate": 6.775533030583559e-06, "loss": 0.8635, "step": 1345 }, { "epoch": 0.8861822072257427, "grad_norm": 1.4108020067214966, "learning_rate": 6.698313471448547e-06, "loss": 1.1874, "step": 1346 }, { "epoch": 0.8868405892519134, "grad_norm": 1.4998079538345337, "learning_rate": 6.62152121631785e-06, "loss": 1.2854, "step": 1347 }, { "epoch": 0.8874989712780841, "grad_norm": 2.0073752403259277, "learning_rate": 6.545156616885373e-06, "loss": 1.1182, "step": 1348 }, { "epoch": 0.8881573533042548, "grad_norm": 1.9627798795700073, "learning_rate": 6.469220022886491e-06, "loss": 1.0721, "step": 1349 }, { "epoch": 0.8888157353304255, "grad_norm": 2.0956573486328125, "learning_rate": 6.39371178209639e-06, "loss": 1.06, "step": 1350 }, { "epoch": 0.8894741173565962, "grad_norm": 0.26062607765197754, "learning_rate": 6.31863224032847e-06, "loss": 1.2184, "step": 1351 }, { "epoch": 0.8901324993827668, "grad_norm": 0.2938222289085388, "learning_rate": 6.24398174143277e-06, "loss": 1.262, "step": 1352 }, { "epoch": 0.8907908814089376, "grad_norm": 0.2993498146533966, "learning_rate": 6.169760627294408e-06, "loss": 1.0936, "step": 1353 }, { "epoch": 0.8914492634351082, "grad_norm": 0.3110993802547455, "learning_rate": 6.095969237831956e-06, "loss": 1.189, "step": 1354 }, { "epoch": 0.8921076454612789, "grad_norm": 0.3342379927635193, "learning_rate": 6.022607910996014e-06, "loss": 1.1589, "step": 1355 }, { "epoch": 0.8927660274874496, "grad_norm": 0.34470638632774353, "learning_rate": 5.949676982767505e-06, "loss": 1.2739, "step": 1356 }, { "epoch": 0.8934244095136202, "grad_norm": 0.3623284101486206, "learning_rate": 5.877176787156213e-06, "loss": 1.2129, "step": 1357 }, { "epoch": 0.894082791539791, "grad_norm": 0.3873266875743866, "learning_rate": 5.805107656199271e-06, "loss": 1.2821, "step": 1358 }, { "epoch": 0.8947411735659616, "grad_norm": 0.4302651882171631, "learning_rate": 5.733469919959622e-06, "loss": 1.2724, "step": 1359 }, { "epoch": 0.8953995555921324, "grad_norm": 0.44161009788513184, "learning_rate": 5.66226390652449e-06, "loss": 1.33, "step": 1360 }, { "epoch": 0.896057937618303, "grad_norm": 0.4458579421043396, "learning_rate": 5.591489942003902e-06, "loss": 1.3203, "step": 1361 }, { "epoch": 0.8967163196444737, "grad_norm": 0.50836181640625, "learning_rate": 5.521148350529137e-06, "loss": 1.4751, "step": 1362 }, { "epoch": 0.8973747016706444, "grad_norm": 0.5364328026771545, "learning_rate": 5.451239454251356e-06, "loss": 1.3369, "step": 1363 }, { "epoch": 0.8980330836968151, "grad_norm": 0.5729132890701294, "learning_rate": 5.381763573340049e-06, "loss": 1.2195, "step": 1364 }, { "epoch": 0.8986914657229857, "grad_norm": 0.5974079966545105, "learning_rate": 5.312721025981504e-06, "loss": 1.2283, "step": 1365 }, { "epoch": 0.8993498477491565, "grad_norm": 0.6872406601905823, "learning_rate": 5.244112128377476e-06, "loss": 1.3682, "step": 1366 }, { "epoch": 0.9000082297753271, "grad_norm": 0.8229653239250183, "learning_rate": 5.1759371947437516e-06, "loss": 1.3575, "step": 1367 }, { "epoch": 0.9006666118014979, "grad_norm": 0.83298259973526, "learning_rate": 5.108196537308507e-06, "loss": 1.1735, "step": 1368 }, { "epoch": 0.9013249938276685, "grad_norm": 1.0485754013061523, "learning_rate": 5.040890466311121e-06, "loss": 1.1567, "step": 1369 }, { "epoch": 0.9019833758538391, "grad_norm": 1.270142912864685, "learning_rate": 4.97401929000062e-06, "loss": 1.1402, "step": 1370 }, { "epoch": 0.9026417578800099, "grad_norm": 1.2949579954147339, "learning_rate": 4.907583314634267e-06, "loss": 0.8752, "step": 1371 }, { "epoch": 0.9033001399061805, "grad_norm": 1.4545363187789917, "learning_rate": 4.841582844476245e-06, "loss": 1.0948, "step": 1372 }, { "epoch": 0.9039585219323513, "grad_norm": 1.7462900876998901, "learning_rate": 4.7760181817961604e-06, "loss": 1.0682, "step": 1373 }, { "epoch": 0.9046169039585219, "grad_norm": 1.53018319606781, "learning_rate": 4.710889626867687e-06, "loss": 0.8449, "step": 1374 }, { "epoch": 0.9052752859846926, "grad_norm": 2.336136817932129, "learning_rate": 4.646197477967251e-06, "loss": 1.2089, "step": 1375 }, { "epoch": 0.9059336680108633, "grad_norm": 0.24908111989498138, "learning_rate": 4.5819420313726545e-06, "loss": 1.1893, "step": 1376 }, { "epoch": 0.906592050037034, "grad_norm": 0.2742631733417511, "learning_rate": 4.518123581361577e-06, "loss": 1.2126, "step": 1377 }, { "epoch": 0.9072504320632047, "grad_norm": 0.29479411244392395, "learning_rate": 4.4547424202104335e-06, "loss": 1.1572, "step": 1378 }, { "epoch": 0.9079088140893754, "grad_norm": 0.299268901348114, "learning_rate": 4.391798838192884e-06, "loss": 1.1414, "step": 1379 }, { "epoch": 0.908567196115546, "grad_norm": 0.32361260056495667, "learning_rate": 4.329293123578604e-06, "loss": 1.2059, "step": 1380 }, { "epoch": 0.9092255781417168, "grad_norm": 0.35213223099708557, "learning_rate": 4.267225562631871e-06, "loss": 1.1661, "step": 1381 }, { "epoch": 0.9098839601678874, "grad_norm": 0.36282479763031006, "learning_rate": 4.2055964396103486e-06, "loss": 1.2415, "step": 1382 }, { "epoch": 0.910542342194058, "grad_norm": 0.39451318979263306, "learning_rate": 4.1444060367636925e-06, "loss": 1.1791, "step": 1383 }, { "epoch": 0.9112007242202288, "grad_norm": 0.4231424927711487, "learning_rate": 4.083654634332334e-06, "loss": 1.3214, "step": 1384 }, { "epoch": 0.9118591062463994, "grad_norm": 0.4535464942455292, "learning_rate": 4.023342510546191e-06, "loss": 1.2253, "step": 1385 }, { "epoch": 0.9125174882725702, "grad_norm": 0.5160973072052002, "learning_rate": 3.963469941623288e-06, "loss": 1.2891, "step": 1386 }, { "epoch": 0.9131758702987408, "grad_norm": 0.5102490782737732, "learning_rate": 3.904037201768618e-06, "loss": 1.2608, "step": 1387 }, { "epoch": 0.9138342523249116, "grad_norm": 0.5157856941223145, "learning_rate": 3.845044563172895e-06, "loss": 1.3482, "step": 1388 }, { "epoch": 0.9144926343510822, "grad_norm": 0.576413094997406, "learning_rate": 3.786492296011157e-06, "loss": 1.2207, "step": 1389 }, { "epoch": 0.9151510163772529, "grad_norm": 0.652743399143219, "learning_rate": 3.7283806684416776e-06, "loss": 1.3499, "step": 1390 }, { "epoch": 0.9158093984034236, "grad_norm": 0.7483471035957336, "learning_rate": 3.670709946604667e-06, "loss": 1.3639, "step": 1391 }, { "epoch": 0.9164677804295943, "grad_norm": 0.7818670868873596, "learning_rate": 3.6134803946210937e-06, "loss": 1.1841, "step": 1392 }, { "epoch": 0.917126162455765, "grad_norm": 0.8835737109184265, "learning_rate": 3.5566922745914335e-06, "loss": 1.0998, "step": 1393 }, { "epoch": 0.9177845444819357, "grad_norm": 1.2403169870376587, "learning_rate": 3.5003458465944882e-06, "loss": 1.5137, "step": 1394 }, { "epoch": 0.9184429265081063, "grad_norm": 1.1381645202636719, "learning_rate": 3.444441368686191e-06, "loss": 1.069, "step": 1395 }, { "epoch": 0.919101308534277, "grad_norm": 1.4728044271469116, "learning_rate": 3.388979096898415e-06, "loss": 1.2426, "step": 1396 }, { "epoch": 0.9197596905604477, "grad_norm": 1.2873393297195435, "learning_rate": 3.3339592852378864e-06, "loss": 0.962, "step": 1397 }, { "epoch": 0.9204180725866183, "grad_norm": 1.5346226692199707, "learning_rate": 3.2793821856848426e-06, "loss": 0.9109, "step": 1398 }, { "epoch": 0.9210764546127891, "grad_norm": 1.9029617309570312, "learning_rate": 3.225248048192053e-06, "loss": 1.1107, "step": 1399 }, { "epoch": 0.9217348366389597, "grad_norm": 1.7355810403823853, "learning_rate": 3.171557120683588e-06, "loss": 1.0064, "step": 1400 }, { "epoch": 0.9223932186651305, "grad_norm": 0.2773732841014862, "learning_rate": 3.118309649053686e-06, "loss": 1.2105, "step": 1401 }, { "epoch": 0.9230516006913011, "grad_norm": 0.2957621216773987, "learning_rate": 3.065505877165675e-06, "loss": 1.2446, "step": 1402 }, { "epoch": 0.9237099827174718, "grad_norm": 0.31012555956840515, "learning_rate": 3.01314604685079e-06, "loss": 1.246, "step": 1403 }, { "epoch": 0.9243683647436425, "grad_norm": 0.3149036169052124, "learning_rate": 2.9612303979071e-06, "loss": 1.161, "step": 1404 }, { "epoch": 0.9250267467698132, "grad_norm": 0.35212263464927673, "learning_rate": 2.909759168098436e-06, "loss": 1.2855, "step": 1405 }, { "epoch": 0.9256851287959839, "grad_norm": 0.3609144389629364, "learning_rate": 2.858732593153246e-06, "loss": 1.1838, "step": 1406 }, { "epoch": 0.9263435108221546, "grad_norm": 0.3796745538711548, "learning_rate": 2.8081509067635516e-06, "loss": 1.2689, "step": 1407 }, { "epoch": 0.9270018928483252, "grad_norm": 0.38216182589530945, "learning_rate": 2.7580143405838586e-06, "loss": 1.1406, "step": 1408 }, { "epoch": 0.9276602748744959, "grad_norm": 0.4440428912639618, "learning_rate": 2.708323124230139e-06, "loss": 1.2299, "step": 1409 }, { "epoch": 0.9283186569006666, "grad_norm": 0.4657718241214752, "learning_rate": 2.659077485278716e-06, "loss": 1.2485, "step": 1410 }, { "epoch": 0.9289770389268373, "grad_norm": 0.5070340037345886, "learning_rate": 2.6102776492652805e-06, "loss": 1.4255, "step": 1411 }, { "epoch": 0.929635420953008, "grad_norm": 0.5538939237594604, "learning_rate": 2.5619238396837662e-06, "loss": 1.4407, "step": 1412 }, { "epoch": 0.9302938029791786, "grad_norm": 0.5763673186302185, "learning_rate": 2.514016277985487e-06, "loss": 1.2522, "step": 1413 }, { "epoch": 0.9309521850053494, "grad_norm": 0.6714739203453064, "learning_rate": 2.466555183577968e-06, "loss": 1.2696, "step": 1414 }, { "epoch": 0.93161056703152, "grad_norm": 0.7577025890350342, "learning_rate": 2.419540773823992e-06, "loss": 1.2229, "step": 1415 }, { "epoch": 0.9322689490576908, "grad_norm": 1.0093814134597778, "learning_rate": 2.3729732640406233e-06, "loss": 1.2574, "step": 1416 }, { "epoch": 0.9329273310838614, "grad_norm": 1.0037925243377686, "learning_rate": 2.32685286749823e-06, "loss": 1.1359, "step": 1417 }, { "epoch": 0.9335857131100321, "grad_norm": 1.3896335363388062, "learning_rate": 2.2811797954194524e-06, "loss": 1.2697, "step": 1418 }, { "epoch": 0.9342440951362028, "grad_norm": 1.2368346452713013, "learning_rate": 2.235954256978301e-06, "loss": 1.1392, "step": 1419 }, { "epoch": 0.9349024771623735, "grad_norm": 1.3703478574752808, "learning_rate": 2.191176459299138e-06, "loss": 1.1208, "step": 1420 }, { "epoch": 0.9355608591885441, "grad_norm": 1.3063801527023315, "learning_rate": 2.146846607455788e-06, "loss": 0.9792, "step": 1421 }, { "epoch": 0.9362192412147149, "grad_norm": 1.5417190790176392, "learning_rate": 2.1029649044705503e-06, "loss": 1.1611, "step": 1422 }, { "epoch": 0.9368776232408855, "grad_norm": 1.48011314868927, "learning_rate": 2.059531551313287e-06, "loss": 0.8745, "step": 1423 }, { "epoch": 0.9375360052670562, "grad_norm": 1.7629079818725586, "learning_rate": 2.016546746900505e-06, "loss": 1.0266, "step": 1424 }, { "epoch": 0.9381943872932269, "grad_norm": 2.372678518295288, "learning_rate": 1.9740106880944297e-06, "loss": 1.2707, "step": 1425 }, { "epoch": 0.9388527693193975, "grad_norm": 0.26507827639579773, "learning_rate": 1.9319235697021763e-06, "loss": 1.2121, "step": 1426 }, { "epoch": 0.9395111513455683, "grad_norm": 0.29203614592552185, "learning_rate": 1.8902855844747047e-06, "loss": 1.1962, "step": 1427 }, { "epoch": 0.9401695333717389, "grad_norm": 0.3047432601451874, "learning_rate": 1.8490969231061084e-06, "loss": 1.2417, "step": 1428 }, { "epoch": 0.9408279153979097, "grad_norm": 0.314283162355423, "learning_rate": 1.8083577742326163e-06, "loss": 1.1419, "step": 1429 }, { "epoch": 0.9414862974240803, "grad_norm": 0.32703661918640137, "learning_rate": 1.7680683244318152e-06, "loss": 1.1603, "step": 1430 }, { "epoch": 0.942144679450251, "grad_norm": 0.3614916503429413, "learning_rate": 1.728228758221706e-06, "loss": 1.1844, "step": 1431 }, { "epoch": 0.9428030614764217, "grad_norm": 0.3773248493671417, "learning_rate": 1.688839258059971e-06, "loss": 1.328, "step": 1432 }, { "epoch": 0.9434614435025924, "grad_norm": 0.42976126074790955, "learning_rate": 1.6499000043429968e-06, "loss": 1.3348, "step": 1433 }, { "epoch": 0.9441198255287631, "grad_norm": 0.4219948947429657, "learning_rate": 1.6114111754051974e-06, "loss": 1.1941, "step": 1434 }, { "epoch": 0.9447782075549338, "grad_norm": 0.4261309802532196, "learning_rate": 1.5733729475181036e-06, "loss": 1.2301, "step": 1435 }, { "epoch": 0.9454365895811044, "grad_norm": 0.46671414375305176, "learning_rate": 1.5357854948895634e-06, "loss": 1.2488, "step": 1436 }, { "epoch": 0.9460949716072751, "grad_norm": 0.5037201642990112, "learning_rate": 1.4986489896629651e-06, "loss": 1.2346, "step": 1437 }, { "epoch": 0.9467533536334458, "grad_norm": 0.5437148809432983, "learning_rate": 1.4619636019164606e-06, "loss": 1.2866, "step": 1438 }, { "epoch": 0.9474117356596164, "grad_norm": 0.6370693445205688, "learning_rate": 1.425729499662154e-06, "loss": 1.2668, "step": 1439 }, { "epoch": 0.9480701176857872, "grad_norm": 0.6582576036453247, "learning_rate": 1.3899468488453583e-06, "loss": 1.215, "step": 1440 }, { "epoch": 0.9487284997119578, "grad_norm": 0.7763570547103882, "learning_rate": 1.354615813343818e-06, "loss": 1.1654, "step": 1441 }, { "epoch": 0.9493868817381286, "grad_norm": 1.096980094909668, "learning_rate": 1.319736554966955e-06, "loss": 1.2844, "step": 1442 }, { "epoch": 0.9500452637642992, "grad_norm": 0.9722412824630737, "learning_rate": 1.2853092334551452e-06, "loss": 1.1631, "step": 1443 }, { "epoch": 0.95070364579047, "grad_norm": 1.2841540575027466, "learning_rate": 1.2513340064790102e-06, "loss": 1.0249, "step": 1444 }, { "epoch": 0.9513620278166406, "grad_norm": 1.3519368171691895, "learning_rate": 1.2178110296386157e-06, "loss": 1.2195, "step": 1445 }, { "epoch": 0.9520204098428113, "grad_norm": 1.3257054090499878, "learning_rate": 1.1847404564628185e-06, "loss": 1.1797, "step": 1446 }, { "epoch": 0.952678791868982, "grad_norm": 1.405526876449585, "learning_rate": 1.1521224384085871e-06, "loss": 0.8704, "step": 1447 }, { "epoch": 0.9533371738951527, "grad_norm": 1.6719517707824707, "learning_rate": 1.1199571248602382e-06, "loss": 1.0272, "step": 1448 }, { "epoch": 0.9539955559213233, "grad_norm": 1.8807095289230347, "learning_rate": 1.0882446631287902e-06, "loss": 0.8809, "step": 1449 }, { "epoch": 0.954653937947494, "grad_norm": 2.01116943359375, "learning_rate": 1.0569851984513103e-06, "loss": 0.9926, "step": 1450 }, { "epoch": 0.9553123199736647, "grad_norm": 0.26615461707115173, "learning_rate": 1.0261788739902022e-06, "loss": 1.2286, "step": 1451 }, { "epoch": 0.9559707019998354, "grad_norm": 0.28414100408554077, "learning_rate": 9.958258308325973e-07, "loss": 1.2566, "step": 1452 }, { "epoch": 0.9566290840260061, "grad_norm": 0.28735122084617615, "learning_rate": 9.659262079896314e-07, "loss": 1.1613, "step": 1453 }, { "epoch": 0.9572874660521767, "grad_norm": 0.31062746047973633, "learning_rate": 9.364801423959235e-07, "loss": 1.2663, "step": 1454 }, { "epoch": 0.9579458480783475, "grad_norm": 0.331766813993454, "learning_rate": 9.074877689088768e-07, "loss": 1.2882, "step": 1455 }, { "epoch": 0.9586042301045181, "grad_norm": 0.34377574920654297, "learning_rate": 8.78949220308023e-07, "loss": 1.2723, "step": 1456 }, { "epoch": 0.9592626121306889, "grad_norm": 0.35146182775497437, "learning_rate": 8.50864627294512e-07, "loss": 1.1882, "step": 1457 }, { "epoch": 0.9599209941568595, "grad_norm": 0.37997257709503174, "learning_rate": 8.232341184904457e-07, "loss": 1.2171, "step": 1458 }, { "epoch": 0.9605793761830302, "grad_norm": 0.4185076057910919, "learning_rate": 7.960578204383007e-07, "loss": 1.2212, "step": 1459 }, { "epoch": 0.9612377582092009, "grad_norm": 0.4693465828895569, "learning_rate": 7.693358576003617e-07, "loss": 1.2836, "step": 1460 }, { "epoch": 0.9618961402353716, "grad_norm": 0.4621378779411316, "learning_rate": 7.430683523581561e-07, "loss": 1.2892, "step": 1461 }, { "epoch": 0.9625545222615423, "grad_norm": 0.49882352352142334, "learning_rate": 7.172554250118535e-07, "loss": 1.2688, "step": 1462 }, { "epoch": 0.9632129042877129, "grad_norm": 0.5431182384490967, "learning_rate": 6.91897193779767e-07, "loss": 1.3595, "step": 1463 }, { "epoch": 0.9638712863138836, "grad_norm": 0.6177190542221069, "learning_rate": 6.66993774797775e-07, "loss": 1.3173, "step": 1464 }, { "epoch": 0.9645296683400543, "grad_norm": 0.6704779863357544, "learning_rate": 6.425452821188116e-07, "loss": 1.2451, "step": 1465 }, { "epoch": 0.965188050366225, "grad_norm": 0.7778468132019043, "learning_rate": 6.185518277123214e-07, "loss": 1.1512, "step": 1466 }, { "epoch": 0.9658464323923956, "grad_norm": 0.7778995633125305, "learning_rate": 5.950135214637831e-07, "loss": 1.1375, "step": 1467 }, { "epoch": 0.9665048144185664, "grad_norm": 0.8882863521575928, "learning_rate": 5.719304711741535e-07, "loss": 1.2534, "step": 1468 }, { "epoch": 0.967163196444737, "grad_norm": 1.0903034210205078, "learning_rate": 5.493027825594244e-07, "loss": 1.0416, "step": 1469 }, { "epoch": 0.9678215784709078, "grad_norm": 1.4687763452529907, "learning_rate": 5.271305592501108e-07, "loss": 1.0645, "step": 1470 }, { "epoch": 0.9684799604970784, "grad_norm": 1.4143941402435303, "learning_rate": 5.054139027907967e-07, "loss": 1.0708, "step": 1471 }, { "epoch": 0.9691383425232492, "grad_norm": 1.5318214893341064, "learning_rate": 4.841529126396238e-07, "loss": 0.9667, "step": 1472 }, { "epoch": 0.9697967245494198, "grad_norm": 1.5767490863800049, "learning_rate": 4.633476861679142e-07, "loss": 0.7994, "step": 1473 }, { "epoch": 0.9704551065755905, "grad_norm": 1.8559094667434692, "learning_rate": 4.429983186596265e-07, "loss": 1.096, "step": 1474 }, { "epoch": 0.9711134886017612, "grad_norm": 2.398397922515869, "learning_rate": 4.2310490331102237e-07, "loss": 1.0258, "step": 1475 }, { "epoch": 0.9717718706279319, "grad_norm": 0.27290716767311096, "learning_rate": 4.036675312301452e-07, "loss": 1.1712, "step": 1476 }, { "epoch": 0.9724302526541025, "grad_norm": 0.27547338604927063, "learning_rate": 3.846862914364868e-07, "loss": 1.1352, "step": 1477 }, { "epoch": 0.9730886346802732, "grad_norm": 0.3008437752723694, "learning_rate": 3.6616127086051e-07, "loss": 1.2512, "step": 1478 }, { "epoch": 0.9737470167064439, "grad_norm": 0.3131862282752991, "learning_rate": 3.4809255434328227e-07, "loss": 1.217, "step": 1479 }, { "epoch": 0.9744053987326146, "grad_norm": 0.32987430691719055, "learning_rate": 3.3048022463612047e-07, "loss": 1.2212, "step": 1480 }, { "epoch": 0.9750637807587853, "grad_norm": 0.36609992384910583, "learning_rate": 3.1332436240011364e-07, "loss": 1.2002, "step": 1481 }, { "epoch": 0.9757221627849559, "grad_norm": 0.38834360241889954, "learning_rate": 2.9662504620588947e-07, "loss": 1.2311, "step": 1482 }, { "epoch": 0.9763805448111267, "grad_norm": 0.3734608292579651, "learning_rate": 2.8038235253311505e-07, "loss": 1.15, "step": 1483 }, { "epoch": 0.9770389268372973, "grad_norm": 0.39932605624198914, "learning_rate": 2.6459635577026353e-07, "loss": 1.1955, "step": 1484 }, { "epoch": 0.9776973088634681, "grad_norm": 0.4378267228603363, "learning_rate": 2.492671282141923e-07, "loss": 1.1007, "step": 1485 }, { "epoch": 0.9783556908896387, "grad_norm": 0.4685000479221344, "learning_rate": 2.343947400698432e-07, "loss": 1.2366, "step": 1486 }, { "epoch": 0.9790140729158094, "grad_norm": 0.5255098938941956, "learning_rate": 2.199792594499428e-07, "loss": 1.3388, "step": 1487 }, { "epoch": 0.9796724549419801, "grad_norm": 0.5867619514465332, "learning_rate": 2.0602075237465823e-07, "loss": 1.2941, "step": 1488 }, { "epoch": 0.9803308369681508, "grad_norm": 0.5758997201919556, "learning_rate": 1.9251928277128628e-07, "loss": 1.1415, "step": 1489 }, { "epoch": 0.9809892189943215, "grad_norm": 0.8498535752296448, "learning_rate": 1.7947491247399806e-07, "loss": 1.2043, "step": 1490 }, { "epoch": 0.9816476010204921, "grad_norm": 0.8512412309646606, "learning_rate": 1.6688770122353925e-07, "loss": 1.3614, "step": 1491 }, { "epoch": 0.9823059830466628, "grad_norm": 0.9369918704032898, "learning_rate": 1.5475770666694144e-07, "loss": 1.1451, "step": 1492 }, { "epoch": 0.9829643650728335, "grad_norm": 1.0150845050811768, "learning_rate": 1.430849843572779e-07, "loss": 1.2014, "step": 1493 }, { "epoch": 0.9836227470990042, "grad_norm": 1.2079380750656128, "learning_rate": 1.318695877533971e-07, "loss": 1.3343, "step": 1494 }, { "epoch": 0.9842811291251748, "grad_norm": 1.062026858329773, "learning_rate": 1.211115682196895e-07, "loss": 0.9174, "step": 1495 }, { "epoch": 0.9849395111513456, "grad_norm": 1.1822501420974731, "learning_rate": 1.1081097502584348e-07, "loss": 0.9607, "step": 1496 }, { "epoch": 0.9855978931775162, "grad_norm": 1.4208043813705444, "learning_rate": 1.0096785534660092e-07, "loss": 0.9065, "step": 1497 }, { "epoch": 0.986256275203687, "grad_norm": 1.4563997983932495, "learning_rate": 9.158225426160183e-08, "loss": 0.9299, "step": 1498 }, { "epoch": 0.9869146572298576, "grad_norm": 3.9852182865142822, "learning_rate": 8.265421475511792e-08, "loss": 1.2768, "step": 1499 }, { "epoch": 0.9875730392560284, "grad_norm": 2.2535030841827393, "learning_rate": 7.418377771585273e-08, "loss": 1.2758, "step": 1500 }, { "epoch": 0.988231421282199, "grad_norm": 0.26907411217689514, "learning_rate": 6.617098193681947e-08, "loss": 1.2956, "step": 1501 }, { "epoch": 0.9888898033083697, "grad_norm": 0.2853264808654785, "learning_rate": 5.86158641150969e-08, "loss": 1.1471, "step": 1502 }, { "epoch": 0.9895481853345404, "grad_norm": 0.3077790439128876, "learning_rate": 5.151845885167372e-08, "loss": 1.2542, "step": 1503 }, { "epoch": 0.990206567360711, "grad_norm": 0.33259809017181396, "learning_rate": 4.487879865133771e-08, "loss": 1.1694, "step": 1504 }, { "epoch": 0.9908649493868817, "grad_norm": 0.35665038228034973, "learning_rate": 3.8696913922475764e-08, "loss": 1.264, "step": 1505 }, { "epoch": 0.9915233314130524, "grad_norm": 0.38716834783554077, "learning_rate": 3.2972832976918554e-08, "loss": 1.3199, "step": 1506 }, { "epoch": 0.9921817134392231, "grad_norm": 0.43460676074028015, "learning_rate": 2.7706582029896068e-08, "loss": 1.2493, "step": 1507 }, { "epoch": 0.9928400954653938, "grad_norm": 0.4777851104736328, "learning_rate": 2.2898185199826673e-08, "loss": 1.3686, "step": 1508 }, { "epoch": 0.9934984774915645, "grad_norm": 0.5349164009094238, "learning_rate": 1.854766450826162e-08, "loss": 1.2081, "step": 1509 }, { "epoch": 0.9941568595177351, "grad_norm": 0.5055113434791565, "learning_rate": 1.4655039879740706e-08, "loss": 1.0484, "step": 1510 }, { "epoch": 0.9948152415439059, "grad_norm": 0.6259576678276062, "learning_rate": 1.122032914177007e-08, "loss": 1.2071, "step": 1511 }, { "epoch": 0.9954736235700765, "grad_norm": 0.7254166007041931, "learning_rate": 8.243548024655656e-09, "loss": 1.3634, "step": 1512 }, { "epoch": 0.9961320055962473, "grad_norm": 0.7936909794807434, "learning_rate": 5.72471016149212e-09, "loss": 1.2007, "step": 1513 }, { "epoch": 0.9967903876224179, "grad_norm": 0.9308106303215027, "learning_rate": 3.663827088085103e-09, "loss": 1.0131, "step": 1514 }, { "epoch": 0.9974487696485886, "grad_norm": 1.1433625221252441, "learning_rate": 2.060908242873527e-09, "loss": 1.1215, "step": 1515 }, { "epoch": 0.9981071516747593, "grad_norm": 1.2838548421859741, "learning_rate": 9.159609669406877e-10, "loss": 1.1398, "step": 1516 }, { "epoch": 0.99876553370093, "grad_norm": 1.4757442474365234, "learning_rate": 2.2899050391433918e-10, "loss": 1.1327, "step": 1517 }, { "epoch": 0.9994239157271007, "grad_norm": 1.396567463874817, "learning_rate": 0.0, "loss": 0.6568, "step": 1518 } ], "logging_steps": 1, "max_steps": 1518, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 380, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.175202288478716e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }