{ "best_global_step": 10026, "best_metric": 2.109537124633789, "best_model_checkpoint": "./sft_output_SimpleQA/checkpoint-10026", "epoch": 3.0, "eval_steps": 500, "global_step": 10026, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002992220227408737, "grad_norm": 215.0, "learning_rate": 4.9955116696588873e-05, "loss": 6.5961, "mean_token_accuracy": 0.14261817783117295, "num_tokens": 4118.0, "step": 10 }, { "epoch": 0.005984440454817474, "grad_norm": 28.5, "learning_rate": 4.990524635946539e-05, "loss": 6.8891, "mean_token_accuracy": 0.023234939482063056, "num_tokens": 10291.0, "step": 20 }, { "epoch": 0.008976660682226212, "grad_norm": 34.25, "learning_rate": 4.9855376022341915e-05, "loss": 6.4879, "mean_token_accuracy": 0.020476288627833128, "num_tokens": 14655.0, "step": 30 }, { "epoch": 0.011968880909634948, "grad_norm": 35.0, "learning_rate": 4.980550568521843e-05, "loss": 6.5951, "mean_token_accuracy": 0.021896280348300934, "num_tokens": 17680.0, "step": 40 }, { "epoch": 0.014961101137043686, "grad_norm": 40.75, "learning_rate": 4.975563534809496e-05, "loss": 6.5836, "mean_token_accuracy": 0.027274718414992093, "num_tokens": 20903.0, "step": 50 }, { "epoch": 0.017953321364452424, "grad_norm": 62.5, "learning_rate": 4.970576501097148e-05, "loss": 6.4953, "mean_token_accuracy": 0.036431427439674736, "num_tokens": 25889.0, "step": 60 }, { "epoch": 0.020945541591861162, "grad_norm": 20.75, "learning_rate": 4.9655894673848e-05, "loss": 6.3477, "mean_token_accuracy": 0.038724389113485816, "num_tokens": 29097.0, "step": 70 }, { "epoch": 0.023937761819269897, "grad_norm": 28.0, "learning_rate": 4.9606024336724516e-05, "loss": 6.3711, "mean_token_accuracy": 0.0324508004821837, "num_tokens": 33201.0, "step": 80 }, { "epoch": 0.026929982046678635, "grad_norm": 30.0, "learning_rate": 4.955615399960104e-05, "loss": 6.0164, "mean_token_accuracy": 0.049895737878978254, "num_tokens": 37509.0, "step": 90 }, { "epoch": 0.029922202274087373, "grad_norm": 14.875, "learning_rate": 4.950628366247756e-05, "loss": 6.218, "mean_token_accuracy": 0.0485645480453968, "num_tokens": 41623.0, "step": 100 }, { "epoch": 0.03291442250149611, "grad_norm": 25.375, "learning_rate": 4.945641332535408e-05, "loss": 6.014, "mean_token_accuracy": 0.05698083452880383, "num_tokens": 46601.0, "step": 110 }, { "epoch": 0.03590664272890485, "grad_norm": 20.5, "learning_rate": 4.9406542988230606e-05, "loss": 5.9919, "mean_token_accuracy": 0.05561460051685572, "num_tokens": 50342.0, "step": 120 }, { "epoch": 0.03889886295631358, "grad_norm": 20.875, "learning_rate": 4.935667265110712e-05, "loss": 6.166, "mean_token_accuracy": 0.04003935307264328, "num_tokens": 55021.0, "step": 130 }, { "epoch": 0.041891083183722325, "grad_norm": 22.375, "learning_rate": 4.930680231398365e-05, "loss": 6.0082, "mean_token_accuracy": 0.05440298151224852, "num_tokens": 58986.0, "step": 140 }, { "epoch": 0.04488330341113106, "grad_norm": 22.875, "learning_rate": 4.9256931976860165e-05, "loss": 5.9211, "mean_token_accuracy": 0.07408283576369286, "num_tokens": 62081.0, "step": 150 }, { "epoch": 0.047875523638539794, "grad_norm": 30.5, "learning_rate": 4.920706163973668e-05, "loss": 5.6951, "mean_token_accuracy": 0.10848762802779674, "num_tokens": 64439.0, "step": 160 }, { "epoch": 0.050867743865948535, "grad_norm": 15.4375, "learning_rate": 4.9157191302613206e-05, "loss": 5.9625, "mean_token_accuracy": 0.06950367838144303, "num_tokens": 68866.0, "step": 170 }, { "epoch": 0.05385996409335727, "grad_norm": 43.75, "learning_rate": 4.910732096548973e-05, "loss": 5.8368, "mean_token_accuracy": 0.07707072868943214, "num_tokens": 73615.0, "step": 180 }, { "epoch": 0.05685218432076601, "grad_norm": 32.75, "learning_rate": 4.9057450628366255e-05, "loss": 5.3259, "mean_token_accuracy": 0.1262487217783928, "num_tokens": 78851.0, "step": 190 }, { "epoch": 0.059844404548174746, "grad_norm": 28.125, "learning_rate": 4.900758029124277e-05, "loss": 5.7784, "mean_token_accuracy": 0.08831716366112233, "num_tokens": 82971.0, "step": 200 }, { "epoch": 0.06283662477558348, "grad_norm": 20.75, "learning_rate": 4.895770995411929e-05, "loss": 5.7123, "mean_token_accuracy": 0.0892865601927042, "num_tokens": 87170.0, "step": 210 }, { "epoch": 0.06582884500299221, "grad_norm": 37.5, "learning_rate": 4.8907839616995814e-05, "loss": 5.5482, "mean_token_accuracy": 0.09651436358690262, "num_tokens": 93105.0, "step": 220 }, { "epoch": 0.06882106523040096, "grad_norm": 22.125, "learning_rate": 4.885796927987233e-05, "loss": 5.1698, "mean_token_accuracy": 0.1460722029209137, "num_tokens": 97170.0, "step": 230 }, { "epoch": 0.0718132854578097, "grad_norm": 76.5, "learning_rate": 4.8808098942748855e-05, "loss": 5.244, "mean_token_accuracy": 0.12373836413025856, "num_tokens": 101830.0, "step": 240 }, { "epoch": 0.07480550568521843, "grad_norm": 16.875, "learning_rate": 4.875822860562538e-05, "loss": 5.207, "mean_token_accuracy": 0.12196040600538254, "num_tokens": 106063.0, "step": 250 }, { "epoch": 0.07779772591262717, "grad_norm": 21.25, "learning_rate": 4.87083582685019e-05, "loss": 5.2538, "mean_token_accuracy": 0.12483496218919754, "num_tokens": 110455.0, "step": 260 }, { "epoch": 0.0807899461400359, "grad_norm": 30.25, "learning_rate": 4.8658487931378414e-05, "loss": 5.0607, "mean_token_accuracy": 0.1395721070468426, "num_tokens": 114231.0, "step": 270 }, { "epoch": 0.08378216636744465, "grad_norm": 17.875, "learning_rate": 4.860861759425494e-05, "loss": 5.4699, "mean_token_accuracy": 0.11910050511360168, "num_tokens": 119683.0, "step": 280 }, { "epoch": 0.08677438659485338, "grad_norm": 28.25, "learning_rate": 4.855874725713146e-05, "loss": 5.1832, "mean_token_accuracy": 0.13222006037831308, "num_tokens": 123255.0, "step": 290 }, { "epoch": 0.08976660682226212, "grad_norm": 27.75, "learning_rate": 4.850887692000799e-05, "loss": 4.8555, "mean_token_accuracy": 0.14820684865117073, "num_tokens": 127933.0, "step": 300 }, { "epoch": 0.09275882704967085, "grad_norm": 23.0, "learning_rate": 4.8459006582884504e-05, "loss": 4.9349, "mean_token_accuracy": 0.15446224436163902, "num_tokens": 132134.0, "step": 310 }, { "epoch": 0.09575104727707959, "grad_norm": 26.25, "learning_rate": 4.840913624576102e-05, "loss": 5.1772, "mean_token_accuracy": 0.1377170652151108, "num_tokens": 136569.0, "step": 320 }, { "epoch": 0.09874326750448834, "grad_norm": 22.5, "learning_rate": 4.8359265908637546e-05, "loss": 5.3314, "mean_token_accuracy": 0.12001498714089394, "num_tokens": 140653.0, "step": 330 }, { "epoch": 0.10173548773189707, "grad_norm": 29.125, "learning_rate": 4.830939557151406e-05, "loss": 4.9681, "mean_token_accuracy": 0.135365604609251, "num_tokens": 145278.0, "step": 340 }, { "epoch": 0.1047277079593058, "grad_norm": 23.375, "learning_rate": 4.825952523439059e-05, "loss": 5.0282, "mean_token_accuracy": 0.15209860131144523, "num_tokens": 148698.0, "step": 350 }, { "epoch": 0.10771992818671454, "grad_norm": 21.0, "learning_rate": 4.820965489726711e-05, "loss": 5.1296, "mean_token_accuracy": 0.14033635556697846, "num_tokens": 152228.0, "step": 360 }, { "epoch": 0.11071214841412327, "grad_norm": 24.875, "learning_rate": 4.815978456014363e-05, "loss": 5.1174, "mean_token_accuracy": 0.14664285108447075, "num_tokens": 156663.0, "step": 370 }, { "epoch": 0.11370436864153202, "grad_norm": 21.625, "learning_rate": 4.8109914223020146e-05, "loss": 5.0752, "mean_token_accuracy": 0.16574262082576752, "num_tokens": 161542.0, "step": 380 }, { "epoch": 0.11669658886894076, "grad_norm": 21.625, "learning_rate": 4.806004388589667e-05, "loss": 4.8108, "mean_token_accuracy": 0.18466916903853417, "num_tokens": 165485.0, "step": 390 }, { "epoch": 0.11968880909634949, "grad_norm": 17.875, "learning_rate": 4.801017354877319e-05, "loss": 5.4408, "mean_token_accuracy": 0.12876051440834999, "num_tokens": 171666.0, "step": 400 }, { "epoch": 0.12268102932375823, "grad_norm": 19.125, "learning_rate": 4.796030321164971e-05, "loss": 5.2179, "mean_token_accuracy": 0.1634024627506733, "num_tokens": 177368.0, "step": 410 }, { "epoch": 0.12567324955116696, "grad_norm": 36.5, "learning_rate": 4.7910432874526236e-05, "loss": 5.0537, "mean_token_accuracy": 0.1408073700964451, "num_tokens": 182587.0, "step": 420 }, { "epoch": 0.1286654697785757, "grad_norm": 27.75, "learning_rate": 4.7860562537402754e-05, "loss": 4.9838, "mean_token_accuracy": 0.15434883609414102, "num_tokens": 187825.0, "step": 430 }, { "epoch": 0.13165769000598443, "grad_norm": 32.5, "learning_rate": 4.781069220027928e-05, "loss": 4.8094, "mean_token_accuracy": 0.1907724179327488, "num_tokens": 192757.0, "step": 440 }, { "epoch": 0.13464991023339318, "grad_norm": 30.75, "learning_rate": 4.7760821863155795e-05, "loss": 5.0489, "mean_token_accuracy": 0.14357075691223145, "num_tokens": 196335.0, "step": 450 }, { "epoch": 0.13764213046080193, "grad_norm": 33.75, "learning_rate": 4.771095152603231e-05, "loss": 5.0883, "mean_token_accuracy": 0.17117929756641387, "num_tokens": 200231.0, "step": 460 }, { "epoch": 0.14063435068821065, "grad_norm": 34.75, "learning_rate": 4.7661081188908844e-05, "loss": 4.7789, "mean_token_accuracy": 0.16136289834976197, "num_tokens": 204065.0, "step": 470 }, { "epoch": 0.1436265709156194, "grad_norm": 32.5, "learning_rate": 4.761121085178536e-05, "loss": 4.7431, "mean_token_accuracy": 0.1755128175020218, "num_tokens": 208508.0, "step": 480 }, { "epoch": 0.14661879114302812, "grad_norm": 23.125, "learning_rate": 4.7561340514661885e-05, "loss": 4.8372, "mean_token_accuracy": 0.18989472165703775, "num_tokens": 213285.0, "step": 490 }, { "epoch": 0.14961101137043686, "grad_norm": 30.875, "learning_rate": 4.75114701775384e-05, "loss": 4.6259, "mean_token_accuracy": 0.21336833536624908, "num_tokens": 218031.0, "step": 500 }, { "epoch": 0.1526032315978456, "grad_norm": 40.5, "learning_rate": 4.746159984041492e-05, "loss": 4.9149, "mean_token_accuracy": 0.16736119762063026, "num_tokens": 223481.0, "step": 510 }, { "epoch": 0.15559545182525433, "grad_norm": 20.125, "learning_rate": 4.7411729503291444e-05, "loss": 4.5618, "mean_token_accuracy": 0.20358971655368804, "num_tokens": 227469.0, "step": 520 }, { "epoch": 0.15858767205266308, "grad_norm": 32.0, "learning_rate": 4.736185916616797e-05, "loss": 4.7476, "mean_token_accuracy": 0.19662052094936372, "num_tokens": 232305.0, "step": 530 }, { "epoch": 0.1615798922800718, "grad_norm": 19.875, "learning_rate": 4.7311988829044486e-05, "loss": 4.6212, "mean_token_accuracy": 0.21569150388240815, "num_tokens": 235543.0, "step": 540 }, { "epoch": 0.16457211250748055, "grad_norm": 32.75, "learning_rate": 4.726211849192101e-05, "loss": 4.816, "mean_token_accuracy": 0.1801095575094223, "num_tokens": 239791.0, "step": 550 }, { "epoch": 0.1675643327348893, "grad_norm": 41.75, "learning_rate": 4.721224815479753e-05, "loss": 4.4197, "mean_token_accuracy": 0.2141128771007061, "num_tokens": 244335.0, "step": 560 }, { "epoch": 0.17055655296229802, "grad_norm": 33.0, "learning_rate": 4.7162377817674045e-05, "loss": 4.7224, "mean_token_accuracy": 0.18216264843940735, "num_tokens": 248915.0, "step": 570 }, { "epoch": 0.17354877318970677, "grad_norm": 18.375, "learning_rate": 4.711250748055057e-05, "loss": 4.6028, "mean_token_accuracy": 0.19473867118358612, "num_tokens": 253993.0, "step": 580 }, { "epoch": 0.1765409934171155, "grad_norm": 22.625, "learning_rate": 4.706263714342709e-05, "loss": 4.839, "mean_token_accuracy": 0.1866973988711834, "num_tokens": 258282.0, "step": 590 }, { "epoch": 0.17953321364452424, "grad_norm": 30.875, "learning_rate": 4.701276680630362e-05, "loss": 5.0221, "mean_token_accuracy": 0.16445840001106263, "num_tokens": 264153.0, "step": 600 }, { "epoch": 0.18252543387193298, "grad_norm": 30.25, "learning_rate": 4.6962896469180135e-05, "loss": 4.6525, "mean_token_accuracy": 0.18769880682229995, "num_tokens": 268110.0, "step": 610 }, { "epoch": 0.1855176540993417, "grad_norm": 31.375, "learning_rate": 4.691302613205665e-05, "loss": 4.7869, "mean_token_accuracy": 0.19825029596686364, "num_tokens": 272602.0, "step": 620 }, { "epoch": 0.18850987432675045, "grad_norm": 21.75, "learning_rate": 4.6863155794933176e-05, "loss": 4.8373, "mean_token_accuracy": 0.19757667630910875, "num_tokens": 276801.0, "step": 630 }, { "epoch": 0.19150209455415917, "grad_norm": 29.875, "learning_rate": 4.6813285457809694e-05, "loss": 4.4989, "mean_token_accuracy": 0.2317608743906021, "num_tokens": 281426.0, "step": 640 }, { "epoch": 0.19449431478156792, "grad_norm": 48.25, "learning_rate": 4.676341512068622e-05, "loss": 4.3297, "mean_token_accuracy": 0.2231058359146118, "num_tokens": 285040.0, "step": 650 }, { "epoch": 0.19748653500897667, "grad_norm": 19.75, "learning_rate": 4.671354478356274e-05, "loss": 4.6316, "mean_token_accuracy": 0.19868830889463424, "num_tokens": 288896.0, "step": 660 }, { "epoch": 0.2004787552363854, "grad_norm": 22.375, "learning_rate": 4.666367444643926e-05, "loss": 4.4535, "mean_token_accuracy": 0.21323003247380257, "num_tokens": 294355.0, "step": 670 }, { "epoch": 0.20347097546379414, "grad_norm": 22.25, "learning_rate": 4.661380410931578e-05, "loss": 4.5425, "mean_token_accuracy": 0.21277685910463334, "num_tokens": 297912.0, "step": 680 }, { "epoch": 0.20646319569120286, "grad_norm": 19.25, "learning_rate": 4.65639337721923e-05, "loss": 4.7785, "mean_token_accuracy": 0.21325334906578064, "num_tokens": 302327.0, "step": 690 }, { "epoch": 0.2094554159186116, "grad_norm": 46.0, "learning_rate": 4.6514063435068825e-05, "loss": 4.2218, "mean_token_accuracy": 0.2309760756790638, "num_tokens": 306067.0, "step": 700 }, { "epoch": 0.21244763614602036, "grad_norm": 17.875, "learning_rate": 4.646419309794535e-05, "loss": 4.6053, "mean_token_accuracy": 0.2088135614991188, "num_tokens": 310339.0, "step": 710 }, { "epoch": 0.21543985637342908, "grad_norm": 29.125, "learning_rate": 4.641432276082187e-05, "loss": 4.2837, "mean_token_accuracy": 0.239446784555912, "num_tokens": 315032.0, "step": 720 }, { "epoch": 0.21843207660083783, "grad_norm": 30.375, "learning_rate": 4.6364452423698384e-05, "loss": 4.3222, "mean_token_accuracy": 0.23795566260814666, "num_tokens": 318991.0, "step": 730 }, { "epoch": 0.22142429682824655, "grad_norm": 41.5, "learning_rate": 4.631458208657491e-05, "loss": 4.219, "mean_token_accuracy": 0.2595970630645752, "num_tokens": 321861.0, "step": 740 }, { "epoch": 0.2244165170556553, "grad_norm": 27.0, "learning_rate": 4.6264711749451426e-05, "loss": 4.6013, "mean_token_accuracy": 0.21915716528892518, "num_tokens": 326438.0, "step": 750 }, { "epoch": 0.22740873728306404, "grad_norm": 37.75, "learning_rate": 4.621484141232795e-05, "loss": 4.2543, "mean_token_accuracy": 0.23498550355434417, "num_tokens": 331305.0, "step": 760 }, { "epoch": 0.23040095751047276, "grad_norm": 25.625, "learning_rate": 4.6164971075204474e-05, "loss": 4.595, "mean_token_accuracy": 0.18473858758807182, "num_tokens": 336072.0, "step": 770 }, { "epoch": 0.2333931777378815, "grad_norm": 24.125, "learning_rate": 4.611510073808099e-05, "loss": 4.5353, "mean_token_accuracy": 0.20565996766090394, "num_tokens": 341423.0, "step": 780 }, { "epoch": 0.23638539796529023, "grad_norm": 34.75, "learning_rate": 4.6065230400957516e-05, "loss": 4.3319, "mean_token_accuracy": 0.25166238844394684, "num_tokens": 345172.0, "step": 790 }, { "epoch": 0.23937761819269898, "grad_norm": 34.75, "learning_rate": 4.601536006383403e-05, "loss": 3.9744, "mean_token_accuracy": 0.2654547780752182, "num_tokens": 349952.0, "step": 800 }, { "epoch": 0.24236983842010773, "grad_norm": 27.125, "learning_rate": 4.596548972671055e-05, "loss": 4.1817, "mean_token_accuracy": 0.2300293117761612, "num_tokens": 352347.0, "step": 810 }, { "epoch": 0.24536205864751645, "grad_norm": 24.0, "learning_rate": 4.5915619389587075e-05, "loss": 4.4951, "mean_token_accuracy": 0.20228217914700508, "num_tokens": 356974.0, "step": 820 }, { "epoch": 0.2483542788749252, "grad_norm": 28.25, "learning_rate": 4.58657490524636e-05, "loss": 4.025, "mean_token_accuracy": 0.2504382789134979, "num_tokens": 362547.0, "step": 830 }, { "epoch": 0.2513464991023339, "grad_norm": 30.125, "learning_rate": 4.5815878715340116e-05, "loss": 4.2457, "mean_token_accuracy": 0.24330067336559297, "num_tokens": 366551.0, "step": 840 }, { "epoch": 0.25433871932974267, "grad_norm": 28.75, "learning_rate": 4.576600837821664e-05, "loss": 3.977, "mean_token_accuracy": 0.2565977931022644, "num_tokens": 370062.0, "step": 850 }, { "epoch": 0.2573309395571514, "grad_norm": 23.0, "learning_rate": 4.571613804109316e-05, "loss": 4.3829, "mean_token_accuracy": 0.24670309722423553, "num_tokens": 373755.0, "step": 860 }, { "epoch": 0.26032315978456017, "grad_norm": 26.375, "learning_rate": 4.5666267703969675e-05, "loss": 3.9334, "mean_token_accuracy": 0.284832064807415, "num_tokens": 377036.0, "step": 870 }, { "epoch": 0.26331538001196886, "grad_norm": 25.25, "learning_rate": 4.5616397366846206e-05, "loss": 4.1585, "mean_token_accuracy": 0.2497795984148979, "num_tokens": 381584.0, "step": 880 }, { "epoch": 0.2663076002393776, "grad_norm": 28.375, "learning_rate": 4.5566527029722724e-05, "loss": 4.0316, "mean_token_accuracy": 0.2681065008044243, "num_tokens": 385316.0, "step": 890 }, { "epoch": 0.26929982046678635, "grad_norm": 19.25, "learning_rate": 4.551665669259925e-05, "loss": 4.3058, "mean_token_accuracy": 0.25345864295959475, "num_tokens": 389732.0, "step": 900 }, { "epoch": 0.2722920406941951, "grad_norm": 19.625, "learning_rate": 4.5466786355475765e-05, "loss": 4.5396, "mean_token_accuracy": 0.20461051017045975, "num_tokens": 393695.0, "step": 910 }, { "epoch": 0.27528426092160385, "grad_norm": 36.0, "learning_rate": 4.541691601835228e-05, "loss": 4.3035, "mean_token_accuracy": 0.28128461763262746, "num_tokens": 397792.0, "step": 920 }, { "epoch": 0.27827648114901254, "grad_norm": 31.625, "learning_rate": 4.536704568122881e-05, "loss": 4.368, "mean_token_accuracy": 0.2355639174580574, "num_tokens": 402363.0, "step": 930 }, { "epoch": 0.2812687013764213, "grad_norm": 25.375, "learning_rate": 4.531717534410533e-05, "loss": 4.1856, "mean_token_accuracy": 0.25618700236082076, "num_tokens": 406460.0, "step": 940 }, { "epoch": 0.28426092160383004, "grad_norm": 44.5, "learning_rate": 4.526730500698185e-05, "loss": 4.0633, "mean_token_accuracy": 0.27557541579008105, "num_tokens": 411194.0, "step": 950 }, { "epoch": 0.2872531418312388, "grad_norm": 23.125, "learning_rate": 4.521743466985837e-05, "loss": 3.8936, "mean_token_accuracy": 0.2943033441901207, "num_tokens": 415382.0, "step": 960 }, { "epoch": 0.29024536205864754, "grad_norm": 24.25, "learning_rate": 4.516756433273489e-05, "loss": 4.068, "mean_token_accuracy": 0.2729153633117676, "num_tokens": 419430.0, "step": 970 }, { "epoch": 0.29323758228605623, "grad_norm": 26.375, "learning_rate": 4.511769399561141e-05, "loss": 4.2647, "mean_token_accuracy": 0.237312476336956, "num_tokens": 423990.0, "step": 980 }, { "epoch": 0.296229802513465, "grad_norm": 22.0, "learning_rate": 4.506782365848793e-05, "loss": 3.8042, "mean_token_accuracy": 0.3054521858692169, "num_tokens": 427393.0, "step": 990 }, { "epoch": 0.2992220227408737, "grad_norm": 30.75, "learning_rate": 4.5017953321364456e-05, "loss": 4.6139, "mean_token_accuracy": 0.20095933154225348, "num_tokens": 432707.0, "step": 1000 }, { "epoch": 0.3022142429682825, "grad_norm": 46.5, "learning_rate": 4.496808298424098e-05, "loss": 3.9964, "mean_token_accuracy": 0.2647743716835976, "num_tokens": 437245.0, "step": 1010 }, { "epoch": 0.3052064631956912, "grad_norm": 34.0, "learning_rate": 4.49182126471175e-05, "loss": 3.9297, "mean_token_accuracy": 0.27339537590742113, "num_tokens": 440804.0, "step": 1020 }, { "epoch": 0.3081986834230999, "grad_norm": 27.5, "learning_rate": 4.4868342309994015e-05, "loss": 4.1647, "mean_token_accuracy": 0.28250799477100375, "num_tokens": 444520.0, "step": 1030 }, { "epoch": 0.31119090365050867, "grad_norm": 27.375, "learning_rate": 4.481847197287054e-05, "loss": 4.4008, "mean_token_accuracy": 0.24318356662988663, "num_tokens": 448536.0, "step": 1040 }, { "epoch": 0.3141831238779174, "grad_norm": 28.5, "learning_rate": 4.4768601635747056e-05, "loss": 4.2291, "mean_token_accuracy": 0.23109171837568282, "num_tokens": 452624.0, "step": 1050 }, { "epoch": 0.31717534410532616, "grad_norm": 21.25, "learning_rate": 4.471873129862358e-05, "loss": 4.0299, "mean_token_accuracy": 0.3011272594332695, "num_tokens": 457675.0, "step": 1060 }, { "epoch": 0.3201675643327349, "grad_norm": 19.75, "learning_rate": 4.4668860961500105e-05, "loss": 4.2009, "mean_token_accuracy": 0.24107901751995087, "num_tokens": 462649.0, "step": 1070 }, { "epoch": 0.3231597845601436, "grad_norm": 23.875, "learning_rate": 4.461899062437662e-05, "loss": 4.0349, "mean_token_accuracy": 0.29197061657905576, "num_tokens": 465830.0, "step": 1080 }, { "epoch": 0.32615200478755235, "grad_norm": 28.125, "learning_rate": 4.4569120287253146e-05, "loss": 3.7125, "mean_token_accuracy": 0.30826200395822523, "num_tokens": 470302.0, "step": 1090 }, { "epoch": 0.3291442250149611, "grad_norm": 24.625, "learning_rate": 4.4519249950129664e-05, "loss": 3.986, "mean_token_accuracy": 0.2726521834731102, "num_tokens": 474105.0, "step": 1100 }, { "epoch": 0.33213644524236985, "grad_norm": 28.75, "learning_rate": 4.446937961300619e-05, "loss": 4.4285, "mean_token_accuracy": 0.24719695299863814, "num_tokens": 479640.0, "step": 1110 }, { "epoch": 0.3351286654697786, "grad_norm": 24.5, "learning_rate": 4.441950927588271e-05, "loss": 4.2159, "mean_token_accuracy": 0.2708512395620346, "num_tokens": 484553.0, "step": 1120 }, { "epoch": 0.3381208856971873, "grad_norm": 36.25, "learning_rate": 4.436963893875923e-05, "loss": 4.2127, "mean_token_accuracy": 0.2593187764286995, "num_tokens": 489188.0, "step": 1130 }, { "epoch": 0.34111310592459604, "grad_norm": 27.5, "learning_rate": 4.431976860163575e-05, "loss": 3.6665, "mean_token_accuracy": 0.32944968044757844, "num_tokens": 491886.0, "step": 1140 }, { "epoch": 0.3441053261520048, "grad_norm": 24.0, "learning_rate": 4.426989826451227e-05, "loss": 3.9058, "mean_token_accuracy": 0.2709208592772484, "num_tokens": 497996.0, "step": 1150 }, { "epoch": 0.34709754637941354, "grad_norm": 16.5, "learning_rate": 4.422002792738879e-05, "loss": 3.6553, "mean_token_accuracy": 0.32792728394269943, "num_tokens": 502342.0, "step": 1160 }, { "epoch": 0.3500897666068223, "grad_norm": 23.5, "learning_rate": 4.417015759026531e-05, "loss": 4.2918, "mean_token_accuracy": 0.27489738166332245, "num_tokens": 507455.0, "step": 1170 }, { "epoch": 0.353081986834231, "grad_norm": 25.25, "learning_rate": 4.412028725314184e-05, "loss": 3.9952, "mean_token_accuracy": 0.28054697811603546, "num_tokens": 512936.0, "step": 1180 }, { "epoch": 0.3560742070616397, "grad_norm": 37.25, "learning_rate": 4.4070416916018354e-05, "loss": 3.9521, "mean_token_accuracy": 0.2756631106138229, "num_tokens": 517570.0, "step": 1190 }, { "epoch": 0.3590664272890485, "grad_norm": 23.75, "learning_rate": 4.402054657889488e-05, "loss": 3.8001, "mean_token_accuracy": 0.29615438580513, "num_tokens": 521641.0, "step": 1200 }, { "epoch": 0.3620586475164572, "grad_norm": 23.25, "learning_rate": 4.3970676241771396e-05, "loss": 3.934, "mean_token_accuracy": 0.30610962957143784, "num_tokens": 526036.0, "step": 1210 }, { "epoch": 0.36505086774386597, "grad_norm": 31.875, "learning_rate": 4.392080590464791e-05, "loss": 4.0934, "mean_token_accuracy": 0.27161254435777665, "num_tokens": 530945.0, "step": 1220 }, { "epoch": 0.36804308797127466, "grad_norm": 24.0, "learning_rate": 4.387093556752444e-05, "loss": 3.8777, "mean_token_accuracy": 0.2866261497139931, "num_tokens": 534340.0, "step": 1230 }, { "epoch": 0.3710353081986834, "grad_norm": 28.0, "learning_rate": 4.382106523040096e-05, "loss": 4.1483, "mean_token_accuracy": 0.2668736070394516, "num_tokens": 538433.0, "step": 1240 }, { "epoch": 0.37402752842609216, "grad_norm": 25.5, "learning_rate": 4.377119489327748e-05, "loss": 3.8482, "mean_token_accuracy": 0.27559166103601457, "num_tokens": 542653.0, "step": 1250 }, { "epoch": 0.3770197486535009, "grad_norm": 46.5, "learning_rate": 4.3721324556154e-05, "loss": 4.2931, "mean_token_accuracy": 0.25253031924366953, "num_tokens": 546820.0, "step": 1260 }, { "epoch": 0.38001196888090966, "grad_norm": 28.375, "learning_rate": 4.367145421903052e-05, "loss": 3.8307, "mean_token_accuracy": 0.3006118655204773, "num_tokens": 552012.0, "step": 1270 }, { "epoch": 0.38300418910831835, "grad_norm": 23.875, "learning_rate": 4.362158388190704e-05, "loss": 3.8406, "mean_token_accuracy": 0.31204836815595627, "num_tokens": 556352.0, "step": 1280 }, { "epoch": 0.3859964093357271, "grad_norm": 36.25, "learning_rate": 4.357171354478357e-05, "loss": 3.9484, "mean_token_accuracy": 0.27679614052176477, "num_tokens": 560781.0, "step": 1290 }, { "epoch": 0.38898862956313585, "grad_norm": 26.625, "learning_rate": 4.3521843207660086e-05, "loss": 3.7752, "mean_token_accuracy": 0.30054267644882204, "num_tokens": 563722.0, "step": 1300 }, { "epoch": 0.3919808497905446, "grad_norm": 63.75, "learning_rate": 4.347197287053661e-05, "loss": 3.7764, "mean_token_accuracy": 0.33363881558179853, "num_tokens": 567678.0, "step": 1310 }, { "epoch": 0.39497307001795334, "grad_norm": 23.5, "learning_rate": 4.342210253341313e-05, "loss": 3.8614, "mean_token_accuracy": 0.2934919849038124, "num_tokens": 571872.0, "step": 1320 }, { "epoch": 0.39796529024536204, "grad_norm": 37.5, "learning_rate": 4.3372232196289645e-05, "loss": 3.8984, "mean_token_accuracy": 0.2898759454488754, "num_tokens": 576076.0, "step": 1330 }, { "epoch": 0.4009575104727708, "grad_norm": 34.5, "learning_rate": 4.332236185916617e-05, "loss": 3.8018, "mean_token_accuracy": 0.30701966434717176, "num_tokens": 580652.0, "step": 1340 }, { "epoch": 0.40394973070017953, "grad_norm": 27.0, "learning_rate": 4.3272491522042694e-05, "loss": 4.0836, "mean_token_accuracy": 0.2806298568844795, "num_tokens": 584037.0, "step": 1350 }, { "epoch": 0.4069419509275883, "grad_norm": 26.875, "learning_rate": 4.322262118491921e-05, "loss": 3.3263, "mean_token_accuracy": 0.3484503984451294, "num_tokens": 587904.0, "step": 1360 }, { "epoch": 0.40993417115499703, "grad_norm": 31.625, "learning_rate": 4.3172750847795735e-05, "loss": 3.8748, "mean_token_accuracy": 0.28735994547605515, "num_tokens": 591506.0, "step": 1370 }, { "epoch": 0.4129263913824057, "grad_norm": 39.0, "learning_rate": 4.312288051067225e-05, "loss": 3.925, "mean_token_accuracy": 0.287045781314373, "num_tokens": 595632.0, "step": 1380 }, { "epoch": 0.41591861160981447, "grad_norm": 27.625, "learning_rate": 4.307301017354878e-05, "loss": 4.0082, "mean_token_accuracy": 0.2599979847669601, "num_tokens": 602507.0, "step": 1390 }, { "epoch": 0.4189108318372232, "grad_norm": 47.5, "learning_rate": 4.3023139836425294e-05, "loss": 4.1298, "mean_token_accuracy": 0.26573686897754667, "num_tokens": 606140.0, "step": 1400 }, { "epoch": 0.42190305206463197, "grad_norm": 24.0, "learning_rate": 4.297326949930182e-05, "loss": 3.883, "mean_token_accuracy": 0.2926565006375313, "num_tokens": 609882.0, "step": 1410 }, { "epoch": 0.4248952722920407, "grad_norm": 54.25, "learning_rate": 4.292339916217834e-05, "loss": 3.8369, "mean_token_accuracy": 0.3231379181146622, "num_tokens": 613987.0, "step": 1420 }, { "epoch": 0.4278874925194494, "grad_norm": 39.5, "learning_rate": 4.287352882505486e-05, "loss": 4.1388, "mean_token_accuracy": 0.28088164180517194, "num_tokens": 618639.0, "step": 1430 }, { "epoch": 0.43087971274685816, "grad_norm": 23.5, "learning_rate": 4.282365848793138e-05, "loss": 3.7898, "mean_token_accuracy": 0.3190530717372894, "num_tokens": 623238.0, "step": 1440 }, { "epoch": 0.4338719329742669, "grad_norm": 22.875, "learning_rate": 4.27737881508079e-05, "loss": 3.3278, "mean_token_accuracy": 0.3729147434234619, "num_tokens": 627865.0, "step": 1450 }, { "epoch": 0.43686415320167565, "grad_norm": 29.125, "learning_rate": 4.272391781368442e-05, "loss": 3.6825, "mean_token_accuracy": 0.3166591688990593, "num_tokens": 632426.0, "step": 1460 }, { "epoch": 0.4398563734290844, "grad_norm": 39.75, "learning_rate": 4.267404747656094e-05, "loss": 4.0308, "mean_token_accuracy": 0.2782156467437744, "num_tokens": 636085.0, "step": 1470 }, { "epoch": 0.4428485936564931, "grad_norm": 28.625, "learning_rate": 4.262417713943747e-05, "loss": 3.8042, "mean_token_accuracy": 0.3088426858186722, "num_tokens": 640123.0, "step": 1480 }, { "epoch": 0.44584081388390184, "grad_norm": 21.5, "learning_rate": 4.2574306802313985e-05, "loss": 3.3991, "mean_token_accuracy": 0.3608384057879448, "num_tokens": 644113.0, "step": 1490 }, { "epoch": 0.4488330341113106, "grad_norm": 35.25, "learning_rate": 4.252443646519051e-05, "loss": 3.973, "mean_token_accuracy": 0.28127337992191315, "num_tokens": 647802.0, "step": 1500 }, { "epoch": 0.45182525433871934, "grad_norm": 24.875, "learning_rate": 4.2474566128067026e-05, "loss": 3.7555, "mean_token_accuracy": 0.2975438117980957, "num_tokens": 651745.0, "step": 1510 }, { "epoch": 0.4548174745661281, "grad_norm": 29.875, "learning_rate": 4.242469579094355e-05, "loss": 3.9791, "mean_token_accuracy": 0.30450038611888885, "num_tokens": 655944.0, "step": 1520 }, { "epoch": 0.4578096947935368, "grad_norm": 39.75, "learning_rate": 4.2374825453820075e-05, "loss": 3.6477, "mean_token_accuracy": 0.3407430723309517, "num_tokens": 660460.0, "step": 1530 }, { "epoch": 0.46080191502094553, "grad_norm": 22.125, "learning_rate": 4.232495511669659e-05, "loss": 4.0353, "mean_token_accuracy": 0.3086406052112579, "num_tokens": 664503.0, "step": 1540 }, { "epoch": 0.4637941352483543, "grad_norm": 27.5, "learning_rate": 4.227508477957311e-05, "loss": 3.5857, "mean_token_accuracy": 0.34699532836675645, "num_tokens": 669506.0, "step": 1550 }, { "epoch": 0.466786355475763, "grad_norm": 57.5, "learning_rate": 4.2225214442449634e-05, "loss": 3.5541, "mean_token_accuracy": 0.3328308701515198, "num_tokens": 674486.0, "step": 1560 }, { "epoch": 0.4697785757031718, "grad_norm": 21.125, "learning_rate": 4.217534410532615e-05, "loss": 3.2879, "mean_token_accuracy": 0.3543445348739624, "num_tokens": 678563.0, "step": 1570 }, { "epoch": 0.47277079593058047, "grad_norm": 34.25, "learning_rate": 4.2125473768202675e-05, "loss": 3.6253, "mean_token_accuracy": 0.31252876818180086, "num_tokens": 682718.0, "step": 1580 }, { "epoch": 0.4757630161579892, "grad_norm": 26.875, "learning_rate": 4.20756034310792e-05, "loss": 4.02, "mean_token_accuracy": 0.3078885093331337, "num_tokens": 686239.0, "step": 1590 }, { "epoch": 0.47875523638539796, "grad_norm": 25.125, "learning_rate": 4.202573309395572e-05, "loss": 3.5034, "mean_token_accuracy": 0.3475412830710411, "num_tokens": 690265.0, "step": 1600 }, { "epoch": 0.4817474566128067, "grad_norm": 29.5, "learning_rate": 4.197586275683224e-05, "loss": 3.7491, "mean_token_accuracy": 0.3203135937452316, "num_tokens": 695201.0, "step": 1610 }, { "epoch": 0.48473967684021546, "grad_norm": 21.375, "learning_rate": 4.192599241970876e-05, "loss": 3.9912, "mean_token_accuracy": 0.2843865931034088, "num_tokens": 700042.0, "step": 1620 }, { "epoch": 0.48773189706762415, "grad_norm": 17.875, "learning_rate": 4.1876122082585276e-05, "loss": 4.4067, "mean_token_accuracy": 0.28071754723787307, "num_tokens": 703650.0, "step": 1630 }, { "epoch": 0.4907241172950329, "grad_norm": 18.5, "learning_rate": 4.18262517454618e-05, "loss": 3.5235, "mean_token_accuracy": 0.3489766031503677, "num_tokens": 708375.0, "step": 1640 }, { "epoch": 0.49371633752244165, "grad_norm": 26.375, "learning_rate": 4.1776381408338324e-05, "loss": 3.9863, "mean_token_accuracy": 0.31152606308460234, "num_tokens": 712864.0, "step": 1650 }, { "epoch": 0.4967085577498504, "grad_norm": 25.75, "learning_rate": 4.172651107121484e-05, "loss": 4.003, "mean_token_accuracy": 0.2858720526099205, "num_tokens": 717126.0, "step": 1660 }, { "epoch": 0.49970077797725915, "grad_norm": 45.25, "learning_rate": 4.1676640734091366e-05, "loss": 3.7429, "mean_token_accuracy": 0.3121335655450821, "num_tokens": 720139.0, "step": 1670 }, { "epoch": 0.5026929982046678, "grad_norm": 27.75, "learning_rate": 4.162677039696788e-05, "loss": 3.5136, "mean_token_accuracy": 0.32364853024482726, "num_tokens": 724517.0, "step": 1680 }, { "epoch": 0.5056852184320766, "grad_norm": 29.125, "learning_rate": 4.157690005984441e-05, "loss": 3.6384, "mean_token_accuracy": 0.3176667720079422, "num_tokens": 728613.0, "step": 1690 }, { "epoch": 0.5086774386594853, "grad_norm": 25.75, "learning_rate": 4.1527029722720925e-05, "loss": 3.4635, "mean_token_accuracy": 0.3570543646812439, "num_tokens": 733893.0, "step": 1700 }, { "epoch": 0.5116696588868941, "grad_norm": 35.25, "learning_rate": 4.147715938559745e-05, "loss": 4.0123, "mean_token_accuracy": 0.2949834749102592, "num_tokens": 737230.0, "step": 1710 }, { "epoch": 0.5146618791143028, "grad_norm": 36.5, "learning_rate": 4.142728904847397e-05, "loss": 3.8303, "mean_token_accuracy": 0.3259445518255234, "num_tokens": 740956.0, "step": 1720 }, { "epoch": 0.5176540993417116, "grad_norm": 26.0, "learning_rate": 4.137741871135049e-05, "loss": 3.5221, "mean_token_accuracy": 0.3586596347391605, "num_tokens": 744980.0, "step": 1730 }, { "epoch": 0.5206463195691203, "grad_norm": 42.25, "learning_rate": 4.132754837422701e-05, "loss": 3.945, "mean_token_accuracy": 0.31935873329639436, "num_tokens": 748174.0, "step": 1740 }, { "epoch": 0.5236385397965291, "grad_norm": 43.25, "learning_rate": 4.127767803710353e-05, "loss": 3.9663, "mean_token_accuracy": 0.31343400850892067, "num_tokens": 752551.0, "step": 1750 }, { "epoch": 0.5266307600239377, "grad_norm": 41.75, "learning_rate": 4.1227807699980056e-05, "loss": 3.5433, "mean_token_accuracy": 0.34673927128314974, "num_tokens": 757265.0, "step": 1760 }, { "epoch": 0.5296229802513465, "grad_norm": 22.125, "learning_rate": 4.1177937362856574e-05, "loss": 3.7775, "mean_token_accuracy": 0.3192309245467186, "num_tokens": 762186.0, "step": 1770 }, { "epoch": 0.5326152004787552, "grad_norm": 36.75, "learning_rate": 4.11280670257331e-05, "loss": 3.4575, "mean_token_accuracy": 0.3697234556078911, "num_tokens": 767136.0, "step": 1780 }, { "epoch": 0.535607420706164, "grad_norm": 55.75, "learning_rate": 4.1078196688609615e-05, "loss": 3.9442, "mean_token_accuracy": 0.306137615442276, "num_tokens": 772421.0, "step": 1790 }, { "epoch": 0.5385996409335727, "grad_norm": 30.25, "learning_rate": 4.102832635148614e-05, "loss": 3.4871, "mean_token_accuracy": 0.34432919770479203, "num_tokens": 777103.0, "step": 1800 }, { "epoch": 0.5415918611609815, "grad_norm": 31.375, "learning_rate": 4.097845601436266e-05, "loss": 4.0582, "mean_token_accuracy": 0.2766360715031624, "num_tokens": 781840.0, "step": 1810 }, { "epoch": 0.5445840813883902, "grad_norm": 22.125, "learning_rate": 4.092858567723918e-05, "loss": 3.6937, "mean_token_accuracy": 0.3213631004095078, "num_tokens": 787443.0, "step": 1820 }, { "epoch": 0.547576301615799, "grad_norm": 24.5, "learning_rate": 4.0878715340115705e-05, "loss": 3.2045, "mean_token_accuracy": 0.35475517213344576, "num_tokens": 792923.0, "step": 1830 }, { "epoch": 0.5505685218432077, "grad_norm": 23.625, "learning_rate": 4.082884500299222e-05, "loss": 3.5229, "mean_token_accuracy": 0.34978632628917694, "num_tokens": 797220.0, "step": 1840 }, { "epoch": 0.5535607420706165, "grad_norm": 31.75, "learning_rate": 4.077897466586874e-05, "loss": 3.7298, "mean_token_accuracy": 0.34635201692581175, "num_tokens": 801850.0, "step": 1850 }, { "epoch": 0.5565529622980251, "grad_norm": 38.5, "learning_rate": 4.0729104328745264e-05, "loss": 3.7174, "mean_token_accuracy": 0.30568271577358247, "num_tokens": 807033.0, "step": 1860 }, { "epoch": 0.5595451825254338, "grad_norm": 60.75, "learning_rate": 4.067923399162178e-05, "loss": 3.1007, "mean_token_accuracy": 0.3980128735303879, "num_tokens": 811446.0, "step": 1870 }, { "epoch": 0.5625374027528426, "grad_norm": 35.0, "learning_rate": 4.0629363654498306e-05, "loss": 3.7698, "mean_token_accuracy": 0.3161139450967312, "num_tokens": 817360.0, "step": 1880 }, { "epoch": 0.5655296229802513, "grad_norm": 22.0, "learning_rate": 4.057949331737483e-05, "loss": 3.7394, "mean_token_accuracy": 0.32581870555877684, "num_tokens": 821346.0, "step": 1890 }, { "epoch": 0.5685218432076601, "grad_norm": 40.75, "learning_rate": 4.052962298025135e-05, "loss": 3.2171, "mean_token_accuracy": 0.3967368185520172, "num_tokens": 824850.0, "step": 1900 }, { "epoch": 0.5715140634350688, "grad_norm": 39.75, "learning_rate": 4.047975264312787e-05, "loss": 3.7614, "mean_token_accuracy": 0.334186664968729, "num_tokens": 829049.0, "step": 1910 }, { "epoch": 0.5745062836624776, "grad_norm": 33.0, "learning_rate": 4.042988230600439e-05, "loss": 3.3675, "mean_token_accuracy": 0.3545887053012848, "num_tokens": 832992.0, "step": 1920 }, { "epoch": 0.5774985038898863, "grad_norm": 38.5, "learning_rate": 4.0380011968880906e-05, "loss": 4.1655, "mean_token_accuracy": 0.27635432183742525, "num_tokens": 837562.0, "step": 1930 }, { "epoch": 0.5804907241172951, "grad_norm": 23.625, "learning_rate": 4.033014163175744e-05, "loss": 3.3483, "mean_token_accuracy": 0.3648817211389542, "num_tokens": 842393.0, "step": 1940 }, { "epoch": 0.5834829443447038, "grad_norm": 28.125, "learning_rate": 4.0280271294633955e-05, "loss": 3.4962, "mean_token_accuracy": 0.34077437222003937, "num_tokens": 847798.0, "step": 1950 }, { "epoch": 0.5864751645721125, "grad_norm": 25.5, "learning_rate": 4.023040095751047e-05, "loss": 3.5223, "mean_token_accuracy": 0.3545456677675247, "num_tokens": 852416.0, "step": 1960 }, { "epoch": 0.5894673847995212, "grad_norm": 24.875, "learning_rate": 4.0180530620386996e-05, "loss": 3.2954, "mean_token_accuracy": 0.37952440828084943, "num_tokens": 856872.0, "step": 1970 }, { "epoch": 0.59245960502693, "grad_norm": 20.25, "learning_rate": 4.0130660283263514e-05, "loss": 3.214, "mean_token_accuracy": 0.3997642546892166, "num_tokens": 861808.0, "step": 1980 }, { "epoch": 0.5954518252543387, "grad_norm": 42.75, "learning_rate": 4.008078994614004e-05, "loss": 3.9122, "mean_token_accuracy": 0.31893026232719424, "num_tokens": 866080.0, "step": 1990 }, { "epoch": 0.5984440454817475, "grad_norm": 17.75, "learning_rate": 4.003091960901656e-05, "loss": 3.68, "mean_token_accuracy": 0.3326193943619728, "num_tokens": 870426.0, "step": 2000 }, { "epoch": 0.6014362657091562, "grad_norm": 29.5, "learning_rate": 3.998104927189308e-05, "loss": 4.0992, "mean_token_accuracy": 0.2667170882225037, "num_tokens": 875695.0, "step": 2010 }, { "epoch": 0.604428485936565, "grad_norm": 27.125, "learning_rate": 3.9931178934769604e-05, "loss": 3.7399, "mean_token_accuracy": 0.3234362483024597, "num_tokens": 878821.0, "step": 2020 }, { "epoch": 0.6074207061639737, "grad_norm": 22.625, "learning_rate": 3.988130859764612e-05, "loss": 3.2405, "mean_token_accuracy": 0.38044323176145556, "num_tokens": 882724.0, "step": 2030 }, { "epoch": 0.6104129263913824, "grad_norm": 25.375, "learning_rate": 3.983143826052264e-05, "loss": 3.5705, "mean_token_accuracy": 0.3319473534822464, "num_tokens": 888338.0, "step": 2040 }, { "epoch": 0.6134051466187912, "grad_norm": 23.125, "learning_rate": 3.978156792339916e-05, "loss": 3.475, "mean_token_accuracy": 0.3405942186713219, "num_tokens": 893344.0, "step": 2050 }, { "epoch": 0.6163973668461998, "grad_norm": 31.25, "learning_rate": 3.973169758627569e-05, "loss": 4.0281, "mean_token_accuracy": 0.29738594889640807, "num_tokens": 896526.0, "step": 2060 }, { "epoch": 0.6193895870736086, "grad_norm": 24.5, "learning_rate": 3.9681827249152204e-05, "loss": 3.1748, "mean_token_accuracy": 0.3818254411220551, "num_tokens": 900369.0, "step": 2070 }, { "epoch": 0.6223818073010173, "grad_norm": 36.0, "learning_rate": 3.963195691202873e-05, "loss": 3.3941, "mean_token_accuracy": 0.3613236457109451, "num_tokens": 904935.0, "step": 2080 }, { "epoch": 0.6253740275284261, "grad_norm": 63.25, "learning_rate": 3.9582086574905246e-05, "loss": 3.2628, "mean_token_accuracy": 0.3658029049634933, "num_tokens": 909555.0, "step": 2090 }, { "epoch": 0.6283662477558348, "grad_norm": 26.25, "learning_rate": 3.953221623778177e-05, "loss": 3.2202, "mean_token_accuracy": 0.38725468665361407, "num_tokens": 914294.0, "step": 2100 }, { "epoch": 0.6313584679832436, "grad_norm": 29.25, "learning_rate": 3.948234590065829e-05, "loss": 3.4529, "mean_token_accuracy": 0.3761670857667923, "num_tokens": 919096.0, "step": 2110 }, { "epoch": 0.6343506882106523, "grad_norm": 33.25, "learning_rate": 3.943247556353481e-05, "loss": 3.5437, "mean_token_accuracy": 0.3384189009666443, "num_tokens": 923520.0, "step": 2120 }, { "epoch": 0.6373429084380611, "grad_norm": 20.375, "learning_rate": 3.9382605226411336e-05, "loss": 3.5569, "mean_token_accuracy": 0.34821858182549476, "num_tokens": 928471.0, "step": 2130 }, { "epoch": 0.6403351286654698, "grad_norm": 29.0, "learning_rate": 3.933273488928785e-05, "loss": 3.6471, "mean_token_accuracy": 0.34270432889461516, "num_tokens": 932646.0, "step": 2140 }, { "epoch": 0.6433273488928786, "grad_norm": 30.625, "learning_rate": 3.928286455216437e-05, "loss": 2.7425, "mean_token_accuracy": 0.4409438192844391, "num_tokens": 936601.0, "step": 2150 }, { "epoch": 0.6463195691202872, "grad_norm": 21.875, "learning_rate": 3.9232994215040895e-05, "loss": 3.6186, "mean_token_accuracy": 0.33609611093997954, "num_tokens": 941591.0, "step": 2160 }, { "epoch": 0.649311789347696, "grad_norm": 26.25, "learning_rate": 3.918312387791742e-05, "loss": 2.9869, "mean_token_accuracy": 0.411373496055603, "num_tokens": 945657.0, "step": 2170 }, { "epoch": 0.6523040095751047, "grad_norm": 27.125, "learning_rate": 3.9133253540793936e-05, "loss": 4.0563, "mean_token_accuracy": 0.2994434475898743, "num_tokens": 951468.0, "step": 2180 }, { "epoch": 0.6552962298025135, "grad_norm": 28.375, "learning_rate": 3.908338320367046e-05, "loss": 3.1732, "mean_token_accuracy": 0.38346787095069884, "num_tokens": 954857.0, "step": 2190 }, { "epoch": 0.6582884500299222, "grad_norm": 37.0, "learning_rate": 3.903351286654698e-05, "loss": 3.3023, "mean_token_accuracy": 0.369389246404171, "num_tokens": 959539.0, "step": 2200 }, { "epoch": 0.661280670257331, "grad_norm": 20.875, "learning_rate": 3.89836425294235e-05, "loss": 3.7477, "mean_token_accuracy": 0.3186024323105812, "num_tokens": 964970.0, "step": 2210 }, { "epoch": 0.6642728904847397, "grad_norm": 25.25, "learning_rate": 3.893377219230002e-05, "loss": 3.6275, "mean_token_accuracy": 0.3292903244495392, "num_tokens": 970684.0, "step": 2220 }, { "epoch": 0.6672651107121484, "grad_norm": 27.125, "learning_rate": 3.8883901855176544e-05, "loss": 3.2893, "mean_token_accuracy": 0.3744408220052719, "num_tokens": 975251.0, "step": 2230 }, { "epoch": 0.6702573309395572, "grad_norm": 46.25, "learning_rate": 3.883403151805307e-05, "loss": 3.2116, "mean_token_accuracy": 0.40995657742023467, "num_tokens": 979714.0, "step": 2240 }, { "epoch": 0.6732495511669659, "grad_norm": 33.5, "learning_rate": 3.8784161180929585e-05, "loss": 3.7125, "mean_token_accuracy": 0.33205676227808, "num_tokens": 984789.0, "step": 2250 }, { "epoch": 0.6762417713943746, "grad_norm": 20.125, "learning_rate": 3.87342908438061e-05, "loss": 3.1216, "mean_token_accuracy": 0.39428712129592897, "num_tokens": 988232.0, "step": 2260 }, { "epoch": 0.6792339916217833, "grad_norm": 30.0, "learning_rate": 3.868442050668263e-05, "loss": 3.813, "mean_token_accuracy": 0.31533592492341994, "num_tokens": 994031.0, "step": 2270 }, { "epoch": 0.6822262118491921, "grad_norm": 37.25, "learning_rate": 3.8634550169559144e-05, "loss": 3.7227, "mean_token_accuracy": 0.32704337537288664, "num_tokens": 998035.0, "step": 2280 }, { "epoch": 0.6852184320766008, "grad_norm": 21.125, "learning_rate": 3.858467983243567e-05, "loss": 3.6412, "mean_token_accuracy": 0.34335303902626035, "num_tokens": 1002893.0, "step": 2290 }, { "epoch": 0.6882106523040096, "grad_norm": 28.75, "learning_rate": 3.853480949531219e-05, "loss": 3.3836, "mean_token_accuracy": 0.3560221940279007, "num_tokens": 1008036.0, "step": 2300 }, { "epoch": 0.6912028725314183, "grad_norm": 24.875, "learning_rate": 3.848493915818871e-05, "loss": 3.5464, "mean_token_accuracy": 0.36754521131515505, "num_tokens": 1012629.0, "step": 2310 }, { "epoch": 0.6941950927588271, "grad_norm": 32.25, "learning_rate": 3.8435068821065234e-05, "loss": 3.2238, "mean_token_accuracy": 0.38233456015586853, "num_tokens": 1017049.0, "step": 2320 }, { "epoch": 0.6971873129862358, "grad_norm": 34.25, "learning_rate": 3.838519848394175e-05, "loss": 3.1007, "mean_token_accuracy": 0.39890468716621397, "num_tokens": 1021908.0, "step": 2330 }, { "epoch": 0.7001795332136446, "grad_norm": 23.125, "learning_rate": 3.833532814681827e-05, "loss": 3.6741, "mean_token_accuracy": 0.3266971483826637, "num_tokens": 1027341.0, "step": 2340 }, { "epoch": 0.7031717534410533, "grad_norm": 43.5, "learning_rate": 3.82854578096948e-05, "loss": 3.1599, "mean_token_accuracy": 0.39910418093204497, "num_tokens": 1031079.0, "step": 2350 }, { "epoch": 0.706163973668462, "grad_norm": 27.75, "learning_rate": 3.823558747257132e-05, "loss": 3.055, "mean_token_accuracy": 0.41069386899471283, "num_tokens": 1035408.0, "step": 2360 }, { "epoch": 0.7091561938958707, "grad_norm": 25.5, "learning_rate": 3.8185717135447835e-05, "loss": 2.9248, "mean_token_accuracy": 0.4071251660585403, "num_tokens": 1039977.0, "step": 2370 }, { "epoch": 0.7121484141232794, "grad_norm": 35.75, "learning_rate": 3.813584679832436e-05, "loss": 3.1529, "mean_token_accuracy": 0.4023617923259735, "num_tokens": 1044329.0, "step": 2380 }, { "epoch": 0.7151406343506882, "grad_norm": 26.125, "learning_rate": 3.8085976461200877e-05, "loss": 3.2873, "mean_token_accuracy": 0.38149598240852356, "num_tokens": 1049211.0, "step": 2390 }, { "epoch": 0.718132854578097, "grad_norm": 27.625, "learning_rate": 3.80361061240774e-05, "loss": 2.9516, "mean_token_accuracy": 0.41472441554069517, "num_tokens": 1053849.0, "step": 2400 }, { "epoch": 0.7211250748055057, "grad_norm": 49.75, "learning_rate": 3.7986235786953925e-05, "loss": 3.0006, "mean_token_accuracy": 0.42310560047626494, "num_tokens": 1057578.0, "step": 2410 }, { "epoch": 0.7241172950329144, "grad_norm": 19.375, "learning_rate": 3.793636544983044e-05, "loss": 3.3444, "mean_token_accuracy": 0.38981711566448213, "num_tokens": 1061754.0, "step": 2420 }, { "epoch": 0.7271095152603232, "grad_norm": 31.875, "learning_rate": 3.7886495112706966e-05, "loss": 3.0919, "mean_token_accuracy": 0.3983312755823135, "num_tokens": 1066049.0, "step": 2430 }, { "epoch": 0.7301017354877319, "grad_norm": 39.75, "learning_rate": 3.7836624775583484e-05, "loss": 3.2329, "mean_token_accuracy": 0.36704597175121306, "num_tokens": 1070346.0, "step": 2440 }, { "epoch": 0.7330939557151407, "grad_norm": 20.875, "learning_rate": 3.778675443846e-05, "loss": 2.9394, "mean_token_accuracy": 0.44624441862106323, "num_tokens": 1073148.0, "step": 2450 }, { "epoch": 0.7360861759425493, "grad_norm": 28.875, "learning_rate": 3.7736884101336525e-05, "loss": 2.833, "mean_token_accuracy": 0.43321623355150224, "num_tokens": 1076646.0, "step": 2460 }, { "epoch": 0.7390783961699581, "grad_norm": 28.75, "learning_rate": 3.768701376421305e-05, "loss": 3.0669, "mean_token_accuracy": 0.4107959717512131, "num_tokens": 1080573.0, "step": 2470 }, { "epoch": 0.7420706163973668, "grad_norm": 18.125, "learning_rate": 3.763714342708957e-05, "loss": 2.5892, "mean_token_accuracy": 0.4758401960134506, "num_tokens": 1084686.0, "step": 2480 }, { "epoch": 0.7450628366247756, "grad_norm": 33.25, "learning_rate": 3.758727308996609e-05, "loss": 3.3653, "mean_token_accuracy": 0.37456787526607516, "num_tokens": 1090273.0, "step": 2490 }, { "epoch": 0.7480550568521843, "grad_norm": 24.875, "learning_rate": 3.753740275284261e-05, "loss": 3.0053, "mean_token_accuracy": 0.39563181102275846, "num_tokens": 1095363.0, "step": 2500 }, { "epoch": 0.7510472770795931, "grad_norm": 18.0, "learning_rate": 3.748753241571913e-05, "loss": 3.0128, "mean_token_accuracy": 0.4145811513066292, "num_tokens": 1100624.0, "step": 2510 }, { "epoch": 0.7540394973070018, "grad_norm": 32.5, "learning_rate": 3.743766207859565e-05, "loss": 3.2632, "mean_token_accuracy": 0.39147082567214964, "num_tokens": 1103844.0, "step": 2520 }, { "epoch": 0.7570317175344106, "grad_norm": 21.75, "learning_rate": 3.7387791741472174e-05, "loss": 3.4745, "mean_token_accuracy": 0.3503845572471619, "num_tokens": 1107559.0, "step": 2530 }, { "epoch": 0.7600239377618193, "grad_norm": 24.375, "learning_rate": 3.73379214043487e-05, "loss": 3.342, "mean_token_accuracy": 0.36978781670331956, "num_tokens": 1112153.0, "step": 2540 }, { "epoch": 0.7630161579892281, "grad_norm": 23.375, "learning_rate": 3.7288051067225216e-05, "loss": 3.0539, "mean_token_accuracy": 0.3902305573225021, "num_tokens": 1116743.0, "step": 2550 }, { "epoch": 0.7660083782166367, "grad_norm": 20.25, "learning_rate": 3.7238180730101733e-05, "loss": 2.8592, "mean_token_accuracy": 0.45265459418296816, "num_tokens": 1121737.0, "step": 2560 }, { "epoch": 0.7690005984440454, "grad_norm": 20.625, "learning_rate": 3.718831039297826e-05, "loss": 3.3017, "mean_token_accuracy": 0.38322472497820853, "num_tokens": 1127296.0, "step": 2570 }, { "epoch": 0.7719928186714542, "grad_norm": 25.5, "learning_rate": 3.713844005585478e-05, "loss": 2.7671, "mean_token_accuracy": 0.4317146301269531, "num_tokens": 1131376.0, "step": 2580 }, { "epoch": 0.7749850388988629, "grad_norm": 34.25, "learning_rate": 3.7088569718731306e-05, "loss": 3.0348, "mean_token_accuracy": 0.4127075791358948, "num_tokens": 1135501.0, "step": 2590 }, { "epoch": 0.7779772591262717, "grad_norm": 27.75, "learning_rate": 3.703869938160782e-05, "loss": 3.4994, "mean_token_accuracy": 0.37491829991340636, "num_tokens": 1139282.0, "step": 2600 }, { "epoch": 0.7809694793536804, "grad_norm": 30.875, "learning_rate": 3.698882904448434e-05, "loss": 3.1417, "mean_token_accuracy": 0.40053197741508484, "num_tokens": 1143542.0, "step": 2610 }, { "epoch": 0.7839616995810892, "grad_norm": 34.0, "learning_rate": 3.6938958707360865e-05, "loss": 3.1492, "mean_token_accuracy": 0.3982168883085251, "num_tokens": 1148007.0, "step": 2620 }, { "epoch": 0.7869539198084979, "grad_norm": 37.75, "learning_rate": 3.688908837023738e-05, "loss": 3.0793, "mean_token_accuracy": 0.3932459011673927, "num_tokens": 1151733.0, "step": 2630 }, { "epoch": 0.7899461400359067, "grad_norm": 25.875, "learning_rate": 3.6839218033113907e-05, "loss": 3.3654, "mean_token_accuracy": 0.37628018409013747, "num_tokens": 1155939.0, "step": 2640 }, { "epoch": 0.7929383602633154, "grad_norm": 130.0, "learning_rate": 3.678934769599043e-05, "loss": 2.9446, "mean_token_accuracy": 0.4227631092071533, "num_tokens": 1160646.0, "step": 2650 }, { "epoch": 0.7959305804907241, "grad_norm": 32.5, "learning_rate": 3.673947735886695e-05, "loss": 3.2153, "mean_token_accuracy": 0.3905955284833908, "num_tokens": 1164275.0, "step": 2660 }, { "epoch": 0.7989228007181328, "grad_norm": 24.125, "learning_rate": 3.6689607021743466e-05, "loss": 2.877, "mean_token_accuracy": 0.43215914219617846, "num_tokens": 1167441.0, "step": 2670 }, { "epoch": 0.8019150209455416, "grad_norm": 26.75, "learning_rate": 3.663973668461999e-05, "loss": 3.4118, "mean_token_accuracy": 0.3704531863331795, "num_tokens": 1171450.0, "step": 2680 }, { "epoch": 0.8049072411729503, "grad_norm": 39.25, "learning_rate": 3.658986634749651e-05, "loss": 3.6131, "mean_token_accuracy": 0.3625437140464783, "num_tokens": 1175198.0, "step": 2690 }, { "epoch": 0.8078994614003591, "grad_norm": 34.25, "learning_rate": 3.653999601037303e-05, "loss": 2.7747, "mean_token_accuracy": 0.4520352780818939, "num_tokens": 1178897.0, "step": 2700 }, { "epoch": 0.8108916816277678, "grad_norm": 28.625, "learning_rate": 3.6490125673249555e-05, "loss": 3.4897, "mean_token_accuracy": 0.3455218479037285, "num_tokens": 1183515.0, "step": 2710 }, { "epoch": 0.8138839018551766, "grad_norm": 27.0, "learning_rate": 3.644025533612607e-05, "loss": 3.6577, "mean_token_accuracy": 0.3339434161782265, "num_tokens": 1187074.0, "step": 2720 }, { "epoch": 0.8168761220825853, "grad_norm": 21.5, "learning_rate": 3.63903849990026e-05, "loss": 3.5652, "mean_token_accuracy": 0.34130973666906356, "num_tokens": 1192494.0, "step": 2730 }, { "epoch": 0.8198683423099941, "grad_norm": 28.125, "learning_rate": 3.6340514661879114e-05, "loss": 2.8965, "mean_token_accuracy": 0.42962965965270994, "num_tokens": 1195933.0, "step": 2740 }, { "epoch": 0.8228605625374028, "grad_norm": 32.75, "learning_rate": 3.629064432475563e-05, "loss": 2.9371, "mean_token_accuracy": 0.4335357487201691, "num_tokens": 1200582.0, "step": 2750 }, { "epoch": 0.8258527827648114, "grad_norm": 36.75, "learning_rate": 3.624077398763216e-05, "loss": 3.3401, "mean_token_accuracy": 0.38103923499584197, "num_tokens": 1206875.0, "step": 2760 }, { "epoch": 0.8288450029922202, "grad_norm": 26.0, "learning_rate": 3.619090365050868e-05, "loss": 2.6235, "mean_token_accuracy": 0.4731802463531494, "num_tokens": 1211109.0, "step": 2770 }, { "epoch": 0.8318372232196289, "grad_norm": 36.25, "learning_rate": 3.61410333133852e-05, "loss": 3.0112, "mean_token_accuracy": 0.4310316130518913, "num_tokens": 1215329.0, "step": 2780 }, { "epoch": 0.8348294434470377, "grad_norm": 26.75, "learning_rate": 3.609116297626172e-05, "loss": 2.9925, "mean_token_accuracy": 0.4128095477819443, "num_tokens": 1218332.0, "step": 2790 }, { "epoch": 0.8378216636744464, "grad_norm": 36.5, "learning_rate": 3.604129263913824e-05, "loss": 2.9043, "mean_token_accuracy": 0.4450998976826668, "num_tokens": 1221567.0, "step": 2800 }, { "epoch": 0.8408138839018552, "grad_norm": 23.75, "learning_rate": 3.5991422302014763e-05, "loss": 3.5922, "mean_token_accuracy": 0.3619121313095093, "num_tokens": 1226914.0, "step": 2810 }, { "epoch": 0.8438061041292639, "grad_norm": 31.875, "learning_rate": 3.594155196489129e-05, "loss": 2.9469, "mean_token_accuracy": 0.4257675677537918, "num_tokens": 1231099.0, "step": 2820 }, { "epoch": 0.8467983243566727, "grad_norm": 27.625, "learning_rate": 3.5891681627767805e-05, "loss": 3.5225, "mean_token_accuracy": 0.39053993821144106, "num_tokens": 1234858.0, "step": 2830 }, { "epoch": 0.8497905445840814, "grad_norm": 21.75, "learning_rate": 3.584181129064433e-05, "loss": 2.6881, "mean_token_accuracy": 0.4483701854944229, "num_tokens": 1239315.0, "step": 2840 }, { "epoch": 0.8527827648114902, "grad_norm": 32.5, "learning_rate": 3.5791940953520847e-05, "loss": 3.7535, "mean_token_accuracy": 0.3519248738884926, "num_tokens": 1244480.0, "step": 2850 }, { "epoch": 0.8557749850388988, "grad_norm": 46.25, "learning_rate": 3.5742070616397364e-05, "loss": 3.0964, "mean_token_accuracy": 0.4106554016470909, "num_tokens": 1249519.0, "step": 2860 }, { "epoch": 0.8587672052663076, "grad_norm": 35.25, "learning_rate": 3.569220027927389e-05, "loss": 3.2774, "mean_token_accuracy": 0.3928678810596466, "num_tokens": 1254161.0, "step": 2870 }, { "epoch": 0.8617594254937163, "grad_norm": 26.25, "learning_rate": 3.564232994215041e-05, "loss": 3.1824, "mean_token_accuracy": 0.39329985678195956, "num_tokens": 1257687.0, "step": 2880 }, { "epoch": 0.8647516457211251, "grad_norm": 23.875, "learning_rate": 3.5592459605026937e-05, "loss": 3.3218, "mean_token_accuracy": 0.40274937748908995, "num_tokens": 1261620.0, "step": 2890 }, { "epoch": 0.8677438659485338, "grad_norm": 36.0, "learning_rate": 3.5542589267903454e-05, "loss": 3.1617, "mean_token_accuracy": 0.386403800547123, "num_tokens": 1266555.0, "step": 2900 }, { "epoch": 0.8707360861759426, "grad_norm": 44.25, "learning_rate": 3.549271893077997e-05, "loss": 3.1503, "mean_token_accuracy": 0.40236749649047854, "num_tokens": 1270915.0, "step": 2910 }, { "epoch": 0.8737283064033513, "grad_norm": 31.875, "learning_rate": 3.5442848593656496e-05, "loss": 2.9395, "mean_token_accuracy": 0.4497757613658905, "num_tokens": 1274978.0, "step": 2920 }, { "epoch": 0.87672052663076, "grad_norm": 27.375, "learning_rate": 3.539297825653301e-05, "loss": 3.525, "mean_token_accuracy": 0.35292821377515793, "num_tokens": 1280176.0, "step": 2930 }, { "epoch": 0.8797127468581688, "grad_norm": 27.5, "learning_rate": 3.534310791940954e-05, "loss": 3.0989, "mean_token_accuracy": 0.39904648810625076, "num_tokens": 1284768.0, "step": 2940 }, { "epoch": 0.8827049670855776, "grad_norm": 22.75, "learning_rate": 3.529323758228606e-05, "loss": 2.5238, "mean_token_accuracy": 0.5021311789751053, "num_tokens": 1288030.0, "step": 2950 }, { "epoch": 0.8856971873129862, "grad_norm": 27.0, "learning_rate": 3.524336724516258e-05, "loss": 2.9962, "mean_token_accuracy": 0.4204836696386337, "num_tokens": 1292102.0, "step": 2960 }, { "epoch": 0.8886894075403949, "grad_norm": 31.625, "learning_rate": 3.5193496908039096e-05, "loss": 3.1125, "mean_token_accuracy": 0.4044138967990875, "num_tokens": 1296788.0, "step": 2970 }, { "epoch": 0.8916816277678037, "grad_norm": 23.125, "learning_rate": 3.514362657091562e-05, "loss": 3.3551, "mean_token_accuracy": 0.3792631521821022, "num_tokens": 1301386.0, "step": 2980 }, { "epoch": 0.8946738479952124, "grad_norm": 46.25, "learning_rate": 3.509375623379214e-05, "loss": 3.3053, "mean_token_accuracy": 0.39626367688179015, "num_tokens": 1306384.0, "step": 2990 }, { "epoch": 0.8976660682226212, "grad_norm": 41.0, "learning_rate": 3.504388589666867e-05, "loss": 3.2402, "mean_token_accuracy": 0.3704844668507576, "num_tokens": 1311546.0, "step": 3000 }, { "epoch": 0.9006582884500299, "grad_norm": 32.75, "learning_rate": 3.4994015559545186e-05, "loss": 2.9308, "mean_token_accuracy": 0.43605942130088804, "num_tokens": 1315636.0, "step": 3010 }, { "epoch": 0.9036505086774387, "grad_norm": 22.125, "learning_rate": 3.4944145222421703e-05, "loss": 3.0013, "mean_token_accuracy": 0.4220082849264145, "num_tokens": 1320955.0, "step": 3020 }, { "epoch": 0.9066427289048474, "grad_norm": 26.625, "learning_rate": 3.489427488529823e-05, "loss": 3.1008, "mean_token_accuracy": 0.42272399365901947, "num_tokens": 1324305.0, "step": 3030 }, { "epoch": 0.9096349491322562, "grad_norm": 21.75, "learning_rate": 3.4844404548174745e-05, "loss": 2.5731, "mean_token_accuracy": 0.4852191686630249, "num_tokens": 1327968.0, "step": 3040 }, { "epoch": 0.9126271693596649, "grad_norm": 23.0, "learning_rate": 3.479453421105127e-05, "loss": 2.9561, "mean_token_accuracy": 0.4218144237995148, "num_tokens": 1331005.0, "step": 3050 }, { "epoch": 0.9156193895870736, "grad_norm": 25.125, "learning_rate": 3.4744663873927793e-05, "loss": 3.4899, "mean_token_accuracy": 0.3722094401717186, "num_tokens": 1335408.0, "step": 3060 }, { "epoch": 0.9186116098144823, "grad_norm": 37.5, "learning_rate": 3.469479353680431e-05, "loss": 3.4433, "mean_token_accuracy": 0.358016200363636, "num_tokens": 1339542.0, "step": 3070 }, { "epoch": 0.9216038300418911, "grad_norm": 34.25, "learning_rate": 3.464492319968083e-05, "loss": 2.8409, "mean_token_accuracy": 0.421732684969902, "num_tokens": 1343054.0, "step": 3080 }, { "epoch": 0.9245960502692998, "grad_norm": 24.375, "learning_rate": 3.459505286255735e-05, "loss": 2.6645, "mean_token_accuracy": 0.47050471752882006, "num_tokens": 1348184.0, "step": 3090 }, { "epoch": 0.9275882704967086, "grad_norm": 30.5, "learning_rate": 3.454518252543387e-05, "loss": 2.9127, "mean_token_accuracy": 0.4304572999477386, "num_tokens": 1352551.0, "step": 3100 }, { "epoch": 0.9305804907241173, "grad_norm": 32.0, "learning_rate": 3.4495312188310394e-05, "loss": 3.4285, "mean_token_accuracy": 0.3730440601706505, "num_tokens": 1355944.0, "step": 3110 }, { "epoch": 0.933572710951526, "grad_norm": 23.625, "learning_rate": 3.444544185118692e-05, "loss": 3.1287, "mean_token_accuracy": 0.41855313777923586, "num_tokens": 1359442.0, "step": 3120 }, { "epoch": 0.9365649311789348, "grad_norm": 30.75, "learning_rate": 3.4395571514063436e-05, "loss": 2.7247, "mean_token_accuracy": 0.454880079627037, "num_tokens": 1364185.0, "step": 3130 }, { "epoch": 0.9395571514063435, "grad_norm": 33.0, "learning_rate": 3.434570117693996e-05, "loss": 3.4506, "mean_token_accuracy": 0.3528758123517036, "num_tokens": 1369430.0, "step": 3140 }, { "epoch": 0.9425493716337523, "grad_norm": 32.25, "learning_rate": 3.429583083981648e-05, "loss": 2.5508, "mean_token_accuracy": 0.5001329183578491, "num_tokens": 1374371.0, "step": 3150 }, { "epoch": 0.9455415918611609, "grad_norm": 33.75, "learning_rate": 3.4245960502692995e-05, "loss": 3.2475, "mean_token_accuracy": 0.38751300275325773, "num_tokens": 1379901.0, "step": 3160 }, { "epoch": 0.9485338120885697, "grad_norm": 29.875, "learning_rate": 3.419609016556952e-05, "loss": 3.3707, "mean_token_accuracy": 0.3779307246208191, "num_tokens": 1383740.0, "step": 3170 }, { "epoch": 0.9515260323159784, "grad_norm": 28.375, "learning_rate": 3.414621982844604e-05, "loss": 3.0298, "mean_token_accuracy": 0.4013781577348709, "num_tokens": 1388384.0, "step": 3180 }, { "epoch": 0.9545182525433872, "grad_norm": 31.625, "learning_rate": 3.409634949132257e-05, "loss": 2.6263, "mean_token_accuracy": 0.4786448389291763, "num_tokens": 1392084.0, "step": 3190 }, { "epoch": 0.9575104727707959, "grad_norm": 29.25, "learning_rate": 3.4046479154199085e-05, "loss": 3.0048, "mean_token_accuracy": 0.4301429718732834, "num_tokens": 1397018.0, "step": 3200 }, { "epoch": 0.9605026929982047, "grad_norm": 24.5, "learning_rate": 3.39966088170756e-05, "loss": 2.8107, "mean_token_accuracy": 0.4548857152462006, "num_tokens": 1403173.0, "step": 3210 }, { "epoch": 0.9634949132256134, "grad_norm": 23.625, "learning_rate": 3.3946738479952126e-05, "loss": 2.7295, "mean_token_accuracy": 0.430012971162796, "num_tokens": 1407652.0, "step": 3220 }, { "epoch": 0.9664871334530222, "grad_norm": 21.375, "learning_rate": 3.389686814282865e-05, "loss": 2.7726, "mean_token_accuracy": 0.4541109263896942, "num_tokens": 1412058.0, "step": 3230 }, { "epoch": 0.9694793536804309, "grad_norm": 28.375, "learning_rate": 3.384699780570517e-05, "loss": 3.0506, "mean_token_accuracy": 0.42236119508743286, "num_tokens": 1417616.0, "step": 3240 }, { "epoch": 0.9724715739078397, "grad_norm": 32.75, "learning_rate": 3.379712746858169e-05, "loss": 2.8471, "mean_token_accuracy": 0.4267496675252914, "num_tokens": 1422617.0, "step": 3250 }, { "epoch": 0.9754637941352483, "grad_norm": 31.875, "learning_rate": 3.374725713145821e-05, "loss": 3.2055, "mean_token_accuracy": 0.3901287063956261, "num_tokens": 1429066.0, "step": 3260 }, { "epoch": 0.9784560143626571, "grad_norm": 35.5, "learning_rate": 3.369738679433473e-05, "loss": 2.8184, "mean_token_accuracy": 0.43766441345214846, "num_tokens": 1434896.0, "step": 3270 }, { "epoch": 0.9814482345900658, "grad_norm": 46.5, "learning_rate": 3.364751645721125e-05, "loss": 2.9339, "mean_token_accuracy": 0.4233125030994415, "num_tokens": 1438794.0, "step": 3280 }, { "epoch": 0.9844404548174746, "grad_norm": 34.75, "learning_rate": 3.3597646120087775e-05, "loss": 2.8863, "mean_token_accuracy": 0.45387964099645617, "num_tokens": 1442559.0, "step": 3290 }, { "epoch": 0.9874326750448833, "grad_norm": 24.75, "learning_rate": 3.35477757829643e-05, "loss": 2.6063, "mean_token_accuracy": 0.4726902186870575, "num_tokens": 1447577.0, "step": 3300 }, { "epoch": 0.990424895272292, "grad_norm": 23.75, "learning_rate": 3.349790544584082e-05, "loss": 3.272, "mean_token_accuracy": 0.3847466349601746, "num_tokens": 1451271.0, "step": 3310 }, { "epoch": 0.9934171154997008, "grad_norm": 25.25, "learning_rate": 3.3448035108717334e-05, "loss": 3.2839, "mean_token_accuracy": 0.3943710759282112, "num_tokens": 1455126.0, "step": 3320 }, { "epoch": 0.9964093357271095, "grad_norm": 33.5, "learning_rate": 3.339816477159386e-05, "loss": 2.2571, "mean_token_accuracy": 0.5220259606838227, "num_tokens": 1457975.0, "step": 3330 }, { "epoch": 0.9994015559545183, "grad_norm": 28.625, "learning_rate": 3.3348294434470376e-05, "loss": 3.0659, "mean_token_accuracy": 0.4067483752965927, "num_tokens": 1463249.0, "step": 3340 }, { "epoch": 1.0, "eval_loss": 2.980771780014038, "eval_mean_token_accuracy": 0.42323312556968545, "eval_num_tokens": 1464244.0, "eval_runtime": 28.073, "eval_samples_per_second": 14.89, "eval_steps_per_second": 1.888, "step": 3342 }, { "epoch": 1.002393776181927, "grad_norm": 39.75, "learning_rate": 3.32984240973469e-05, "loss": 3.3756, "mean_token_accuracy": 0.35648047029972074, "num_tokens": 1467102.0, "step": 3350 }, { "epoch": 1.0053859964093357, "grad_norm": 22.625, "learning_rate": 3.3248553760223424e-05, "loss": 2.4231, "mean_token_accuracy": 0.49451322853565216, "num_tokens": 1472322.0, "step": 3360 }, { "epoch": 1.0083782166367445, "grad_norm": 21.875, "learning_rate": 3.319868342309994e-05, "loss": 2.4485, "mean_token_accuracy": 0.49648206532001493, "num_tokens": 1477093.0, "step": 3370 }, { "epoch": 1.0113704368641532, "grad_norm": 27.875, "learning_rate": 3.314881308597646e-05, "loss": 2.6138, "mean_token_accuracy": 0.4813783347606659, "num_tokens": 1480728.0, "step": 3380 }, { "epoch": 1.014362657091562, "grad_norm": 45.75, "learning_rate": 3.309894274885298e-05, "loss": 2.6418, "mean_token_accuracy": 0.4929414004087448, "num_tokens": 1485390.0, "step": 3390 }, { "epoch": 1.0173548773189707, "grad_norm": 27.875, "learning_rate": 3.30490724117295e-05, "loss": 2.4796, "mean_token_accuracy": 0.49346729218959806, "num_tokens": 1491122.0, "step": 3400 }, { "epoch": 1.0203470975463793, "grad_norm": 29.375, "learning_rate": 3.299920207460603e-05, "loss": 2.6773, "mean_token_accuracy": 0.4725665241479874, "num_tokens": 1495421.0, "step": 3410 }, { "epoch": 1.0233393177737882, "grad_norm": 43.5, "learning_rate": 3.294933173748255e-05, "loss": 3.0846, "mean_token_accuracy": 0.419183561205864, "num_tokens": 1499769.0, "step": 3420 }, { "epoch": 1.0263315380011968, "grad_norm": 33.75, "learning_rate": 3.2899461400359066e-05, "loss": 2.6298, "mean_token_accuracy": 0.4962582170963287, "num_tokens": 1504170.0, "step": 3430 }, { "epoch": 1.0293237582286057, "grad_norm": 20.5, "learning_rate": 3.284959106323559e-05, "loss": 2.5207, "mean_token_accuracy": 0.4708354324102402, "num_tokens": 1508664.0, "step": 3440 }, { "epoch": 1.0323159784560143, "grad_norm": 37.25, "learning_rate": 3.279972072611211e-05, "loss": 2.6655, "mean_token_accuracy": 0.4595662534236908, "num_tokens": 1514082.0, "step": 3450 }, { "epoch": 1.0353081986834232, "grad_norm": 24.875, "learning_rate": 3.274985038898863e-05, "loss": 2.8414, "mean_token_accuracy": 0.4344394147396088, "num_tokens": 1519310.0, "step": 3460 }, { "epoch": 1.0383004189108318, "grad_norm": 87.5, "learning_rate": 3.2699980051865156e-05, "loss": 2.5733, "mean_token_accuracy": 0.48141989260911944, "num_tokens": 1522961.0, "step": 3470 }, { "epoch": 1.0412926391382407, "grad_norm": 24.25, "learning_rate": 3.2650109714741674e-05, "loss": 2.6847, "mean_token_accuracy": 0.4702356606721878, "num_tokens": 1527447.0, "step": 3480 }, { "epoch": 1.0442848593656493, "grad_norm": 33.75, "learning_rate": 3.26002393776182e-05, "loss": 2.6967, "mean_token_accuracy": 0.4401080310344696, "num_tokens": 1532143.0, "step": 3490 }, { "epoch": 1.0472770795930582, "grad_norm": 39.75, "learning_rate": 3.2550369040494715e-05, "loss": 2.9048, "mean_token_accuracy": 0.4236243531107903, "num_tokens": 1535966.0, "step": 3500 }, { "epoch": 1.0502692998204668, "grad_norm": 32.25, "learning_rate": 3.250049870337123e-05, "loss": 2.8366, "mean_token_accuracy": 0.4745761215686798, "num_tokens": 1540192.0, "step": 3510 }, { "epoch": 1.0532615200478754, "grad_norm": 40.25, "learning_rate": 3.245062836624776e-05, "loss": 2.5117, "mean_token_accuracy": 0.4899686545133591, "num_tokens": 1544596.0, "step": 3520 }, { "epoch": 1.0562537402752843, "grad_norm": 26.25, "learning_rate": 3.240075802912428e-05, "loss": 2.6037, "mean_token_accuracy": 0.47316262423992156, "num_tokens": 1550210.0, "step": 3530 }, { "epoch": 1.059245960502693, "grad_norm": 22.0, "learning_rate": 3.23508876920008e-05, "loss": 2.8637, "mean_token_accuracy": 0.44064879715442656, "num_tokens": 1554533.0, "step": 3540 }, { "epoch": 1.0622381807301018, "grad_norm": 38.0, "learning_rate": 3.230101735487732e-05, "loss": 2.6214, "mean_token_accuracy": 0.47286906838417053, "num_tokens": 1560617.0, "step": 3550 }, { "epoch": 1.0652304009575104, "grad_norm": 27.875, "learning_rate": 3.225114701775384e-05, "loss": 2.8798, "mean_token_accuracy": 0.4531641513109207, "num_tokens": 1563908.0, "step": 3560 }, { "epoch": 1.0682226211849193, "grad_norm": 37.75, "learning_rate": 3.220127668063036e-05, "loss": 2.8749, "mean_token_accuracy": 0.43050872087478637, "num_tokens": 1568474.0, "step": 3570 }, { "epoch": 1.071214841412328, "grad_norm": 24.875, "learning_rate": 3.215140634350688e-05, "loss": 3.0744, "mean_token_accuracy": 0.4257981926202774, "num_tokens": 1572746.0, "step": 3580 }, { "epoch": 1.0742070616397368, "grad_norm": 40.5, "learning_rate": 3.2101536006383406e-05, "loss": 2.5789, "mean_token_accuracy": 0.47388690114021303, "num_tokens": 1578454.0, "step": 3590 }, { "epoch": 1.0771992818671454, "grad_norm": 29.75, "learning_rate": 3.205166566925993e-05, "loss": 2.9829, "mean_token_accuracy": 0.4279584616422653, "num_tokens": 1583489.0, "step": 3600 }, { "epoch": 1.080191502094554, "grad_norm": 37.0, "learning_rate": 3.200179533213645e-05, "loss": 2.7567, "mean_token_accuracy": 0.4380374073982239, "num_tokens": 1586955.0, "step": 3610 }, { "epoch": 1.083183722321963, "grad_norm": 21.75, "learning_rate": 3.1951924995012965e-05, "loss": 3.1002, "mean_token_accuracy": 0.40744185745716094, "num_tokens": 1592004.0, "step": 3620 }, { "epoch": 1.0861759425493716, "grad_norm": 50.75, "learning_rate": 3.190205465788949e-05, "loss": 2.9584, "mean_token_accuracy": 0.4398718744516373, "num_tokens": 1597537.0, "step": 3630 }, { "epoch": 1.0891681627767804, "grad_norm": 29.125, "learning_rate": 3.185218432076601e-05, "loss": 2.6612, "mean_token_accuracy": 0.4644526898860931, "num_tokens": 1601772.0, "step": 3640 }, { "epoch": 1.092160383004189, "grad_norm": 28.5, "learning_rate": 3.180231398364253e-05, "loss": 2.3019, "mean_token_accuracy": 0.5160307466983796, "num_tokens": 1605652.0, "step": 3650 }, { "epoch": 1.095152603231598, "grad_norm": 29.5, "learning_rate": 3.1752443646519055e-05, "loss": 3.1181, "mean_token_accuracy": 0.3980431482195854, "num_tokens": 1609800.0, "step": 3660 }, { "epoch": 1.0981448234590065, "grad_norm": 39.5, "learning_rate": 3.170257330939557e-05, "loss": 2.772, "mean_token_accuracy": 0.4531924784183502, "num_tokens": 1614274.0, "step": 3670 }, { "epoch": 1.1011370436864154, "grad_norm": 26.125, "learning_rate": 3.165270297227209e-05, "loss": 2.3661, "mean_token_accuracy": 0.514206713438034, "num_tokens": 1617770.0, "step": 3680 }, { "epoch": 1.104129263913824, "grad_norm": 27.875, "learning_rate": 3.1602832635148614e-05, "loss": 3.0596, "mean_token_accuracy": 0.4163840651512146, "num_tokens": 1621234.0, "step": 3690 }, { "epoch": 1.1071214841412327, "grad_norm": 53.5, "learning_rate": 3.155296229802514e-05, "loss": 2.6817, "mean_token_accuracy": 0.47163409888744356, "num_tokens": 1625009.0, "step": 3700 }, { "epoch": 1.1101137043686415, "grad_norm": 24.125, "learning_rate": 3.150309196090166e-05, "loss": 3.2935, "mean_token_accuracy": 0.3971044301986694, "num_tokens": 1630070.0, "step": 3710 }, { "epoch": 1.1131059245960502, "grad_norm": 24.375, "learning_rate": 3.145322162377818e-05, "loss": 2.5974, "mean_token_accuracy": 0.4851890727877617, "num_tokens": 1634810.0, "step": 3720 }, { "epoch": 1.116098144823459, "grad_norm": 40.5, "learning_rate": 3.14033512866547e-05, "loss": 2.5665, "mean_token_accuracy": 0.48123776614665986, "num_tokens": 1638819.0, "step": 3730 }, { "epoch": 1.1190903650508677, "grad_norm": 23.75, "learning_rate": 3.135348094953122e-05, "loss": 2.4931, "mean_token_accuracy": 0.49635804891586305, "num_tokens": 1644484.0, "step": 3740 }, { "epoch": 1.1220825852782765, "grad_norm": 36.25, "learning_rate": 3.130361061240774e-05, "loss": 2.9514, "mean_token_accuracy": 0.4365529015660286, "num_tokens": 1648231.0, "step": 3750 }, { "epoch": 1.1250748055056852, "grad_norm": 22.625, "learning_rate": 3.125374027528426e-05, "loss": 2.9057, "mean_token_accuracy": 0.4231098830699921, "num_tokens": 1653761.0, "step": 3760 }, { "epoch": 1.128067025733094, "grad_norm": 27.0, "learning_rate": 3.120386993816079e-05, "loss": 2.6079, "mean_token_accuracy": 0.5077326536178589, "num_tokens": 1657623.0, "step": 3770 }, { "epoch": 1.1310592459605027, "grad_norm": 24.625, "learning_rate": 3.1153999601037304e-05, "loss": 2.7906, "mean_token_accuracy": 0.4419715031981468, "num_tokens": 1661939.0, "step": 3780 }, { "epoch": 1.1340514661879113, "grad_norm": 28.125, "learning_rate": 3.110412926391383e-05, "loss": 2.4565, "mean_token_accuracy": 0.5014745116233825, "num_tokens": 1666447.0, "step": 3790 }, { "epoch": 1.1370436864153202, "grad_norm": 40.75, "learning_rate": 3.1054258926790346e-05, "loss": 2.3208, "mean_token_accuracy": 0.5103875666856765, "num_tokens": 1671267.0, "step": 3800 }, { "epoch": 1.140035906642729, "grad_norm": 31.375, "learning_rate": 3.100438858966686e-05, "loss": 3.0056, "mean_token_accuracy": 0.4169232577085495, "num_tokens": 1675208.0, "step": 3810 }, { "epoch": 1.1430281268701377, "grad_norm": 25.875, "learning_rate": 3.0954518252543394e-05, "loss": 2.7892, "mean_token_accuracy": 0.4438593596220016, "num_tokens": 1679847.0, "step": 3820 }, { "epoch": 1.1460203470975463, "grad_norm": 26.375, "learning_rate": 3.090464791541991e-05, "loss": 2.8558, "mean_token_accuracy": 0.4273435175418854, "num_tokens": 1684286.0, "step": 3830 }, { "epoch": 1.1490125673249552, "grad_norm": 69.0, "learning_rate": 3.085477757829643e-05, "loss": 2.9511, "mean_token_accuracy": 0.4328667402267456, "num_tokens": 1687901.0, "step": 3840 }, { "epoch": 1.1520047875523638, "grad_norm": 35.0, "learning_rate": 3.080490724117295e-05, "loss": 2.7366, "mean_token_accuracy": 0.45511636435985564, "num_tokens": 1691925.0, "step": 3850 }, { "epoch": 1.1549970077797727, "grad_norm": 32.75, "learning_rate": 3.075503690404947e-05, "loss": 2.3323, "mean_token_accuracy": 0.4906174510717392, "num_tokens": 1696413.0, "step": 3860 }, { "epoch": 1.1579892280071813, "grad_norm": 47.5, "learning_rate": 3.0705166566925995e-05, "loss": 3.2158, "mean_token_accuracy": 0.4027450352907181, "num_tokens": 1700064.0, "step": 3870 }, { "epoch": 1.1609814482345902, "grad_norm": 29.375, "learning_rate": 3.065529622980252e-05, "loss": 2.4488, "mean_token_accuracy": 0.49048165082931516, "num_tokens": 1704477.0, "step": 3880 }, { "epoch": 1.1639736684619988, "grad_norm": 23.125, "learning_rate": 3.0605425892679036e-05, "loss": 2.9511, "mean_token_accuracy": 0.41550472676753997, "num_tokens": 1709655.0, "step": 3890 }, { "epoch": 1.1669658886894076, "grad_norm": 24.5, "learning_rate": 3.055555555555556e-05, "loss": 2.4906, "mean_token_accuracy": 0.4970557034015656, "num_tokens": 1714138.0, "step": 3900 }, { "epoch": 1.1699581089168163, "grad_norm": 28.0, "learning_rate": 3.0505685218432078e-05, "loss": 2.1077, "mean_token_accuracy": 0.5406092315912246, "num_tokens": 1717129.0, "step": 3910 }, { "epoch": 1.172950329144225, "grad_norm": 27.75, "learning_rate": 3.04558148813086e-05, "loss": 2.7482, "mean_token_accuracy": 0.458242666721344, "num_tokens": 1722600.0, "step": 3920 }, { "epoch": 1.1759425493716338, "grad_norm": 24.5, "learning_rate": 3.0405944544185123e-05, "loss": 2.4023, "mean_token_accuracy": 0.49710969924926757, "num_tokens": 1727134.0, "step": 3930 }, { "epoch": 1.1789347695990424, "grad_norm": 35.75, "learning_rate": 3.035607420706164e-05, "loss": 2.7705, "mean_token_accuracy": 0.4731149792671204, "num_tokens": 1731436.0, "step": 3940 }, { "epoch": 1.1819269898264513, "grad_norm": 32.0, "learning_rate": 3.030620386993816e-05, "loss": 2.5216, "mean_token_accuracy": 0.49482154846191406, "num_tokens": 1734311.0, "step": 3950 }, { "epoch": 1.18491921005386, "grad_norm": 21.875, "learning_rate": 3.0256333532814685e-05, "loss": 2.6194, "mean_token_accuracy": 0.4716911166906357, "num_tokens": 1739795.0, "step": 3960 }, { "epoch": 1.1879114302812688, "grad_norm": 41.0, "learning_rate": 3.0206463195691203e-05, "loss": 2.5254, "mean_token_accuracy": 0.49075353145599365, "num_tokens": 1743697.0, "step": 3970 }, { "epoch": 1.1909036505086774, "grad_norm": 23.875, "learning_rate": 3.0156592858567723e-05, "loss": 2.7566, "mean_token_accuracy": 0.46060216426849365, "num_tokens": 1747837.0, "step": 3980 }, { "epoch": 1.1938958707360863, "grad_norm": 23.375, "learning_rate": 3.0106722521444248e-05, "loss": 2.4751, "mean_token_accuracy": 0.488738951086998, "num_tokens": 1752378.0, "step": 3990 }, { "epoch": 1.196888090963495, "grad_norm": 25.25, "learning_rate": 3.0056852184320765e-05, "loss": 2.7018, "mean_token_accuracy": 0.47838138192892077, "num_tokens": 1755508.0, "step": 4000 }, { "epoch": 1.1998803111909035, "grad_norm": 39.5, "learning_rate": 3.000698184719729e-05, "loss": 3.0874, "mean_token_accuracy": 0.42261676490306854, "num_tokens": 1760372.0, "step": 4010 }, { "epoch": 1.2028725314183124, "grad_norm": 33.5, "learning_rate": 2.995711151007381e-05, "loss": 2.0667, "mean_token_accuracy": 0.5681251019239426, "num_tokens": 1764165.0, "step": 4020 }, { "epoch": 1.205864751645721, "grad_norm": 28.625, "learning_rate": 2.9907241172950327e-05, "loss": 2.7279, "mean_token_accuracy": 0.46859243512153625, "num_tokens": 1769407.0, "step": 4030 }, { "epoch": 1.20885697187313, "grad_norm": 28.5, "learning_rate": 2.9857370835826855e-05, "loss": 2.3177, "mean_token_accuracy": 0.5130757451057434, "num_tokens": 1773247.0, "step": 4040 }, { "epoch": 1.2118491921005385, "grad_norm": 18.25, "learning_rate": 2.9807500498703372e-05, "loss": 2.614, "mean_token_accuracy": 0.5009590089321136, "num_tokens": 1778089.0, "step": 4050 }, { "epoch": 1.2148414123279474, "grad_norm": 50.0, "learning_rate": 2.975763016157989e-05, "loss": 2.474, "mean_token_accuracy": 0.5134980499744415, "num_tokens": 1781721.0, "step": 4060 }, { "epoch": 1.217833632555356, "grad_norm": 28.75, "learning_rate": 2.9707759824456417e-05, "loss": 2.7975, "mean_token_accuracy": 0.45258464217185973, "num_tokens": 1786756.0, "step": 4070 }, { "epoch": 1.220825852782765, "grad_norm": 34.5, "learning_rate": 2.9657889487332935e-05, "loss": 2.6269, "mean_token_accuracy": 0.46714275777339936, "num_tokens": 1791148.0, "step": 4080 }, { "epoch": 1.2238180730101735, "grad_norm": 17.25, "learning_rate": 2.960801915020946e-05, "loss": 2.8817, "mean_token_accuracy": 0.4374359607696533, "num_tokens": 1797159.0, "step": 4090 }, { "epoch": 1.2268102932375822, "grad_norm": 27.125, "learning_rate": 2.955814881308598e-05, "loss": 2.6874, "mean_token_accuracy": 0.4609710812568665, "num_tokens": 1802126.0, "step": 4100 }, { "epoch": 1.229802513464991, "grad_norm": 49.5, "learning_rate": 2.9508278475962497e-05, "loss": 2.8059, "mean_token_accuracy": 0.4560705676674843, "num_tokens": 1806970.0, "step": 4110 }, { "epoch": 1.2327947336923997, "grad_norm": 30.25, "learning_rate": 2.945840813883902e-05, "loss": 2.2505, "mean_token_accuracy": 0.5249142855405807, "num_tokens": 1810926.0, "step": 4120 }, { "epoch": 1.2357869539198085, "grad_norm": 39.25, "learning_rate": 2.9408537801715542e-05, "loss": 2.8528, "mean_token_accuracy": 0.4332689017057419, "num_tokens": 1814977.0, "step": 4130 }, { "epoch": 1.2387791741472172, "grad_norm": 26.75, "learning_rate": 2.935866746459206e-05, "loss": 2.786, "mean_token_accuracy": 0.4369647607207298, "num_tokens": 1819584.0, "step": 4140 }, { "epoch": 1.241771394374626, "grad_norm": 31.125, "learning_rate": 2.9308797127468584e-05, "loss": 2.5191, "mean_token_accuracy": 0.4885488286614418, "num_tokens": 1824482.0, "step": 4150 }, { "epoch": 1.2447636146020347, "grad_norm": 34.75, "learning_rate": 2.9258926790345104e-05, "loss": 2.6278, "mean_token_accuracy": 0.47676413506269455, "num_tokens": 1827908.0, "step": 4160 }, { "epoch": 1.2477558348294435, "grad_norm": 39.0, "learning_rate": 2.9209056453221622e-05, "loss": 2.7343, "mean_token_accuracy": 0.4897153973579407, "num_tokens": 1833197.0, "step": 4170 }, { "epoch": 1.2507480550568522, "grad_norm": 31.375, "learning_rate": 2.9159186116098146e-05, "loss": 2.9593, "mean_token_accuracy": 0.42748028337955474, "num_tokens": 1837651.0, "step": 4180 }, { "epoch": 1.2537402752842608, "grad_norm": 41.0, "learning_rate": 2.9109315778974667e-05, "loss": 2.5404, "mean_token_accuracy": 0.47800021767616274, "num_tokens": 1842555.0, "step": 4190 }, { "epoch": 1.2567324955116697, "grad_norm": 35.5, "learning_rate": 2.905944544185119e-05, "loss": 2.5295, "mean_token_accuracy": 0.47885517179965975, "num_tokens": 1846060.0, "step": 4200 }, { "epoch": 1.2597247157390785, "grad_norm": 33.5, "learning_rate": 2.900957510472771e-05, "loss": 2.7566, "mean_token_accuracy": 0.4646937608718872, "num_tokens": 1849781.0, "step": 4210 }, { "epoch": 1.2627169359664872, "grad_norm": 44.25, "learning_rate": 2.895970476760423e-05, "loss": 2.2948, "mean_token_accuracy": 0.5232088088989257, "num_tokens": 1853337.0, "step": 4220 }, { "epoch": 1.2657091561938958, "grad_norm": 26.125, "learning_rate": 2.8909834430480753e-05, "loss": 3.3095, "mean_token_accuracy": 0.38957217186689375, "num_tokens": 1857925.0, "step": 4230 }, { "epoch": 1.2687013764213046, "grad_norm": 33.0, "learning_rate": 2.885996409335727e-05, "loss": 2.3966, "mean_token_accuracy": 0.5166613161563873, "num_tokens": 1861656.0, "step": 4240 }, { "epoch": 1.2716935966487133, "grad_norm": 31.5, "learning_rate": 2.881009375623379e-05, "loss": 2.8243, "mean_token_accuracy": 0.4604656994342804, "num_tokens": 1866691.0, "step": 4250 }, { "epoch": 1.2746858168761221, "grad_norm": 32.75, "learning_rate": 2.8760223419110316e-05, "loss": 2.4936, "mean_token_accuracy": 0.4825888335704803, "num_tokens": 1871883.0, "step": 4260 }, { "epoch": 1.2776780371035308, "grad_norm": 24.5, "learning_rate": 2.8710353081986837e-05, "loss": 3.0457, "mean_token_accuracy": 0.44213865548372266, "num_tokens": 1876630.0, "step": 4270 }, { "epoch": 1.2806702573309394, "grad_norm": 31.875, "learning_rate": 2.8660482744863354e-05, "loss": 2.4731, "mean_token_accuracy": 0.5032434403896332, "num_tokens": 1882195.0, "step": 4280 }, { "epoch": 1.2836624775583483, "grad_norm": 45.75, "learning_rate": 2.8610612407739878e-05, "loss": 2.5616, "mean_token_accuracy": 0.4920288294553757, "num_tokens": 1886803.0, "step": 4290 }, { "epoch": 1.2866546977857571, "grad_norm": 29.375, "learning_rate": 2.85607420706164e-05, "loss": 2.7951, "mean_token_accuracy": 0.4569530040025711, "num_tokens": 1891033.0, "step": 4300 }, { "epoch": 1.2896469180131658, "grad_norm": 24.25, "learning_rate": 2.8510871733492923e-05, "loss": 2.6682, "mean_token_accuracy": 0.48598000407218933, "num_tokens": 1896433.0, "step": 4310 }, { "epoch": 1.2926391382405744, "grad_norm": 19.0, "learning_rate": 2.846100139636944e-05, "loss": 2.9857, "mean_token_accuracy": 0.3988271489739418, "num_tokens": 1901571.0, "step": 4320 }, { "epoch": 1.2956313584679833, "grad_norm": 30.25, "learning_rate": 2.841113105924596e-05, "loss": 1.8598, "mean_token_accuracy": 0.585317638516426, "num_tokens": 1906215.0, "step": 4330 }, { "epoch": 1.298623578695392, "grad_norm": 39.5, "learning_rate": 2.8361260722122485e-05, "loss": 2.6775, "mean_token_accuracy": 0.47187069356441497, "num_tokens": 1909121.0, "step": 4340 }, { "epoch": 1.3016157989228008, "grad_norm": 30.875, "learning_rate": 2.8311390384999003e-05, "loss": 3.2452, "mean_token_accuracy": 0.3938755810260773, "num_tokens": 1913371.0, "step": 4350 }, { "epoch": 1.3046080191502094, "grad_norm": 46.0, "learning_rate": 2.8261520047875524e-05, "loss": 2.6618, "mean_token_accuracy": 0.4829663932323456, "num_tokens": 1917072.0, "step": 4360 }, { "epoch": 1.3076002393776183, "grad_norm": 42.25, "learning_rate": 2.8211649710752048e-05, "loss": 2.636, "mean_token_accuracy": 0.48219700157642365, "num_tokens": 1919981.0, "step": 4370 }, { "epoch": 1.310592459605027, "grad_norm": 19.875, "learning_rate": 2.8161779373628565e-05, "loss": 2.2672, "mean_token_accuracy": 0.5360256373882294, "num_tokens": 1923903.0, "step": 4380 }, { "epoch": 1.3135846798324358, "grad_norm": 27.375, "learning_rate": 2.811190903650509e-05, "loss": 2.9116, "mean_token_accuracy": 0.4232723444700241, "num_tokens": 1927175.0, "step": 4390 }, { "epoch": 1.3165769000598444, "grad_norm": 24.875, "learning_rate": 2.806203869938161e-05, "loss": 2.5052, "mean_token_accuracy": 0.49001633524894717, "num_tokens": 1930883.0, "step": 4400 }, { "epoch": 1.319569120287253, "grad_norm": 29.875, "learning_rate": 2.8012168362258128e-05, "loss": 2.9911, "mean_token_accuracy": 0.44069938361644745, "num_tokens": 1934961.0, "step": 4410 }, { "epoch": 1.322561340514662, "grad_norm": 31.625, "learning_rate": 2.7962298025134652e-05, "loss": 2.9474, "mean_token_accuracy": 0.43861653506755827, "num_tokens": 1940103.0, "step": 4420 }, { "epoch": 1.3255535607420705, "grad_norm": 32.5, "learning_rate": 2.7912427688011173e-05, "loss": 2.6026, "mean_token_accuracy": 0.48938514590263366, "num_tokens": 1944793.0, "step": 4430 }, { "epoch": 1.3285457809694794, "grad_norm": 35.0, "learning_rate": 2.786255735088769e-05, "loss": 2.2731, "mean_token_accuracy": 0.517932477593422, "num_tokens": 1948864.0, "step": 4440 }, { "epoch": 1.331538001196888, "grad_norm": 28.875, "learning_rate": 2.7812687013764218e-05, "loss": 2.3692, "mean_token_accuracy": 0.5257283002138138, "num_tokens": 1951872.0, "step": 4450 }, { "epoch": 1.334530221424297, "grad_norm": 21.375, "learning_rate": 2.7762816676640735e-05, "loss": 2.2699, "mean_token_accuracy": 0.5130042910575867, "num_tokens": 1956690.0, "step": 4460 }, { "epoch": 1.3375224416517055, "grad_norm": 41.0, "learning_rate": 2.7712946339517252e-05, "loss": 2.6093, "mean_token_accuracy": 0.46930437386035917, "num_tokens": 1960310.0, "step": 4470 }, { "epoch": 1.3405146618791144, "grad_norm": 34.0, "learning_rate": 2.766307600239378e-05, "loss": 3.125, "mean_token_accuracy": 0.3979645177721977, "num_tokens": 1964518.0, "step": 4480 }, { "epoch": 1.343506882106523, "grad_norm": 41.5, "learning_rate": 2.7613205665270297e-05, "loss": 2.4804, "mean_token_accuracy": 0.5112906128168107, "num_tokens": 1967926.0, "step": 4490 }, { "epoch": 1.3464991023339317, "grad_norm": 19.25, "learning_rate": 2.756333532814682e-05, "loss": 2.3555, "mean_token_accuracy": 0.5107169628143311, "num_tokens": 1971907.0, "step": 4500 }, { "epoch": 1.3494913225613405, "grad_norm": 35.75, "learning_rate": 2.7513464991023342e-05, "loss": 2.7525, "mean_token_accuracy": 0.46286738514900205, "num_tokens": 1976005.0, "step": 4510 }, { "epoch": 1.3524835427887494, "grad_norm": 24.25, "learning_rate": 2.746359465389986e-05, "loss": 3.0099, "mean_token_accuracy": 0.4340349406003952, "num_tokens": 1980363.0, "step": 4520 }, { "epoch": 1.355475763016158, "grad_norm": 30.125, "learning_rate": 2.7413724316776384e-05, "loss": 2.4456, "mean_token_accuracy": 0.495415124297142, "num_tokens": 1984602.0, "step": 4530 }, { "epoch": 1.3584679832435667, "grad_norm": 18.75, "learning_rate": 2.7363853979652905e-05, "loss": 2.4915, "mean_token_accuracy": 0.49877011477947236, "num_tokens": 1988696.0, "step": 4540 }, { "epoch": 1.3614602034709755, "grad_norm": 35.5, "learning_rate": 2.7313983642529422e-05, "loss": 2.5525, "mean_token_accuracy": 0.46752736568450926, "num_tokens": 1993603.0, "step": 4550 }, { "epoch": 1.3644524236983842, "grad_norm": 26.25, "learning_rate": 2.7264113305405946e-05, "loss": 2.8927, "mean_token_accuracy": 0.4371457099914551, "num_tokens": 1998477.0, "step": 4560 }, { "epoch": 1.367444643925793, "grad_norm": 53.25, "learning_rate": 2.7214242968282467e-05, "loss": 2.874, "mean_token_accuracy": 0.4590120866894722, "num_tokens": 2002477.0, "step": 4570 }, { "epoch": 1.3704368641532017, "grad_norm": 37.75, "learning_rate": 2.7164372631158985e-05, "loss": 2.5264, "mean_token_accuracy": 0.4863176017999649, "num_tokens": 2006648.0, "step": 4580 }, { "epoch": 1.3734290843806103, "grad_norm": 21.625, "learning_rate": 2.711450229403551e-05, "loss": 2.5302, "mean_token_accuracy": 0.49698867201805114, "num_tokens": 2012013.0, "step": 4590 }, { "epoch": 1.3764213046080191, "grad_norm": 48.5, "learning_rate": 2.706463195691203e-05, "loss": 2.7545, "mean_token_accuracy": 0.4686457276344299, "num_tokens": 2016062.0, "step": 4600 }, { "epoch": 1.379413524835428, "grad_norm": 27.625, "learning_rate": 2.7014761619788554e-05, "loss": 2.7296, "mean_token_accuracy": 0.48929981589317323, "num_tokens": 2020561.0, "step": 4610 }, { "epoch": 1.3824057450628366, "grad_norm": 94.0, "learning_rate": 2.696489128266507e-05, "loss": 2.2092, "mean_token_accuracy": 0.5368007481098175, "num_tokens": 2023737.0, "step": 4620 }, { "epoch": 1.3853979652902453, "grad_norm": 27.375, "learning_rate": 2.6915020945541592e-05, "loss": 2.3728, "mean_token_accuracy": 0.5155722945928574, "num_tokens": 2028489.0, "step": 4630 }, { "epoch": 1.3883901855176541, "grad_norm": 48.75, "learning_rate": 2.6865150608418116e-05, "loss": 2.6691, "mean_token_accuracy": 0.463055419921875, "num_tokens": 2033078.0, "step": 4640 }, { "epoch": 1.3913824057450628, "grad_norm": 50.75, "learning_rate": 2.6815280271294633e-05, "loss": 2.1985, "mean_token_accuracy": 0.5305429100990295, "num_tokens": 2038096.0, "step": 4650 }, { "epoch": 1.3943746259724716, "grad_norm": 58.25, "learning_rate": 2.6765409934171154e-05, "loss": 2.6201, "mean_token_accuracy": 0.4828624278306961, "num_tokens": 2041923.0, "step": 4660 }, { "epoch": 1.3973668461998803, "grad_norm": 23.625, "learning_rate": 2.671553959704768e-05, "loss": 2.5054, "mean_token_accuracy": 0.5020751714706421, "num_tokens": 2047556.0, "step": 4670 }, { "epoch": 1.400359066427289, "grad_norm": 22.625, "learning_rate": 2.6665669259924196e-05, "loss": 2.5951, "mean_token_accuracy": 0.5069888651371002, "num_tokens": 2052067.0, "step": 4680 }, { "epoch": 1.4033512866546978, "grad_norm": 29.125, "learning_rate": 2.6615798922800723e-05, "loss": 2.668, "mean_token_accuracy": 0.4788062393665314, "num_tokens": 2056698.0, "step": 4690 }, { "epoch": 1.4063435068821066, "grad_norm": 30.75, "learning_rate": 2.656592858567724e-05, "loss": 2.4209, "mean_token_accuracy": 0.5212773263454438, "num_tokens": 2060376.0, "step": 4700 }, { "epoch": 1.4093357271095153, "grad_norm": 41.75, "learning_rate": 2.651605824855376e-05, "loss": 2.0899, "mean_token_accuracy": 0.580978724360466, "num_tokens": 2063694.0, "step": 4710 }, { "epoch": 1.412327947336924, "grad_norm": 36.75, "learning_rate": 2.6466187911430286e-05, "loss": 1.9738, "mean_token_accuracy": 0.5679178059101104, "num_tokens": 2068346.0, "step": 4720 }, { "epoch": 1.4153201675643328, "grad_norm": 34.0, "learning_rate": 2.6416317574306803e-05, "loss": 2.7249, "mean_token_accuracy": 0.4764808028936386, "num_tokens": 2071494.0, "step": 4730 }, { "epoch": 1.4183123877917414, "grad_norm": 27.125, "learning_rate": 2.6366447237183324e-05, "loss": 2.6907, "mean_token_accuracy": 0.46054676324129107, "num_tokens": 2076057.0, "step": 4740 }, { "epoch": 1.4213046080191503, "grad_norm": 49.25, "learning_rate": 2.6316576900059848e-05, "loss": 2.7199, "mean_token_accuracy": 0.4358877420425415, "num_tokens": 2080486.0, "step": 4750 }, { "epoch": 1.424296828246559, "grad_norm": 43.75, "learning_rate": 2.6266706562936366e-05, "loss": 2.1997, "mean_token_accuracy": 0.5404698431491852, "num_tokens": 2083740.0, "step": 4760 }, { "epoch": 1.4272890484739678, "grad_norm": 23.125, "learning_rate": 2.6216836225812886e-05, "loss": 2.9674, "mean_token_accuracy": 0.45190505385398866, "num_tokens": 2088920.0, "step": 4770 }, { "epoch": 1.4302812687013764, "grad_norm": 21.0, "learning_rate": 2.616696588868941e-05, "loss": 1.822, "mean_token_accuracy": 0.6119337141513824, "num_tokens": 2092746.0, "step": 4780 }, { "epoch": 1.4332734889287853, "grad_norm": 40.5, "learning_rate": 2.6117095551565928e-05, "loss": 2.81, "mean_token_accuracy": 0.4584572911262512, "num_tokens": 2096346.0, "step": 4790 }, { "epoch": 1.436265709156194, "grad_norm": 28.125, "learning_rate": 2.6067225214442452e-05, "loss": 2.5106, "mean_token_accuracy": 0.48858973383903503, "num_tokens": 2100465.0, "step": 4800 }, { "epoch": 1.4392579293836025, "grad_norm": 32.75, "learning_rate": 2.6017354877318973e-05, "loss": 2.519, "mean_token_accuracy": 0.4892947614192963, "num_tokens": 2105797.0, "step": 4810 }, { "epoch": 1.4422501496110114, "grad_norm": 34.0, "learning_rate": 2.596748454019549e-05, "loss": 2.2762, "mean_token_accuracy": 0.5497841536998749, "num_tokens": 2109547.0, "step": 4820 }, { "epoch": 1.44524236983842, "grad_norm": 28.0, "learning_rate": 2.5917614203072015e-05, "loss": 2.7829, "mean_token_accuracy": 0.44075029492378237, "num_tokens": 2114477.0, "step": 4830 }, { "epoch": 1.4482345900658289, "grad_norm": 32.75, "learning_rate": 2.5867743865948535e-05, "loss": 2.6167, "mean_token_accuracy": 0.49549303352832796, "num_tokens": 2119250.0, "step": 4840 }, { "epoch": 1.4512268102932375, "grad_norm": 27.375, "learning_rate": 2.5817873528825053e-05, "loss": 2.197, "mean_token_accuracy": 0.5380541890859604, "num_tokens": 2123856.0, "step": 4850 }, { "epoch": 1.4542190305206464, "grad_norm": 26.875, "learning_rate": 2.5768003191701577e-05, "loss": 2.7312, "mean_token_accuracy": 0.49555808305740356, "num_tokens": 2128744.0, "step": 4860 }, { "epoch": 1.457211250748055, "grad_norm": 23.375, "learning_rate": 2.5718132854578098e-05, "loss": 2.5342, "mean_token_accuracy": 0.48873263597488403, "num_tokens": 2134383.0, "step": 4870 }, { "epoch": 1.4602034709754639, "grad_norm": 36.5, "learning_rate": 2.5668262517454615e-05, "loss": 2.3537, "mean_token_accuracy": 0.5590060204267502, "num_tokens": 2138661.0, "step": 4880 }, { "epoch": 1.4631956912028725, "grad_norm": 44.25, "learning_rate": 2.5618392180331143e-05, "loss": 2.9716, "mean_token_accuracy": 0.4578070640563965, "num_tokens": 2143943.0, "step": 4890 }, { "epoch": 1.4661879114302812, "grad_norm": 18.25, "learning_rate": 2.556852184320766e-05, "loss": 2.4542, "mean_token_accuracy": 0.49858992397785185, "num_tokens": 2150346.0, "step": 4900 }, { "epoch": 1.46918013165769, "grad_norm": 43.5, "learning_rate": 2.5518651506084184e-05, "loss": 2.0952, "mean_token_accuracy": 0.5627575278282165, "num_tokens": 2155714.0, "step": 4910 }, { "epoch": 1.4721723518850989, "grad_norm": 24.375, "learning_rate": 2.5468781168960705e-05, "loss": 2.5803, "mean_token_accuracy": 0.4823772877454758, "num_tokens": 2160085.0, "step": 4920 }, { "epoch": 1.4751645721125075, "grad_norm": 21.125, "learning_rate": 2.5418910831837222e-05, "loss": 2.1136, "mean_token_accuracy": 0.5596380650997161, "num_tokens": 2164085.0, "step": 4930 }, { "epoch": 1.4781567923399161, "grad_norm": 29.625, "learning_rate": 2.5369040494713747e-05, "loss": 2.6144, "mean_token_accuracy": 0.46260478496551516, "num_tokens": 2167762.0, "step": 4940 }, { "epoch": 1.481149012567325, "grad_norm": 22.0, "learning_rate": 2.5319170157590267e-05, "loss": 2.2213, "mean_token_accuracy": 0.5482838392257691, "num_tokens": 2171910.0, "step": 4950 }, { "epoch": 1.4841412327947336, "grad_norm": 42.75, "learning_rate": 2.5269299820466785e-05, "loss": 2.7502, "mean_token_accuracy": 0.46187237948179244, "num_tokens": 2176244.0, "step": 4960 }, { "epoch": 1.4871334530221425, "grad_norm": 30.25, "learning_rate": 2.521942948334331e-05, "loss": 2.7146, "mean_token_accuracy": 0.48688181936740876, "num_tokens": 2181027.0, "step": 4970 }, { "epoch": 1.4901256732495511, "grad_norm": 29.25, "learning_rate": 2.516955914621983e-05, "loss": 2.1965, "mean_token_accuracy": 0.5618243396282196, "num_tokens": 2185783.0, "step": 4980 }, { "epoch": 1.4931178934769598, "grad_norm": 38.0, "learning_rate": 2.5119688809096354e-05, "loss": 2.5971, "mean_token_accuracy": 0.496805140376091, "num_tokens": 2190346.0, "step": 4990 }, { "epoch": 1.4961101137043686, "grad_norm": 48.5, "learning_rate": 2.506981847197287e-05, "loss": 2.5287, "mean_token_accuracy": 0.49744627475738523, "num_tokens": 2194540.0, "step": 5000 }, { "epoch": 1.4991023339317775, "grad_norm": 29.125, "learning_rate": 2.5019948134849392e-05, "loss": 2.4346, "mean_token_accuracy": 0.5162340372800827, "num_tokens": 2198738.0, "step": 5010 }, { "epoch": 1.5020945541591861, "grad_norm": 28.875, "learning_rate": 2.4970077797725913e-05, "loss": 2.4267, "mean_token_accuracy": 0.5139214277267456, "num_tokens": 2203672.0, "step": 5020 }, { "epoch": 1.5050867743865948, "grad_norm": 37.0, "learning_rate": 2.4920207460602434e-05, "loss": 2.1187, "mean_token_accuracy": 0.5651913076639176, "num_tokens": 2208322.0, "step": 5030 }, { "epoch": 1.5080789946140036, "grad_norm": 39.0, "learning_rate": 2.4870337123478958e-05, "loss": 2.0766, "mean_token_accuracy": 0.5604483515024186, "num_tokens": 2212748.0, "step": 5040 }, { "epoch": 1.5110712148414125, "grad_norm": 39.25, "learning_rate": 2.4820466786355475e-05, "loss": 2.437, "mean_token_accuracy": 0.5023006498813629, "num_tokens": 2216429.0, "step": 5050 }, { "epoch": 1.5140634350688211, "grad_norm": 22.375, "learning_rate": 2.4770596449231996e-05, "loss": 2.5274, "mean_token_accuracy": 0.5053725391626358, "num_tokens": 2220799.0, "step": 5060 }, { "epoch": 1.5170556552962298, "grad_norm": 32.0, "learning_rate": 2.472072611210852e-05, "loss": 2.1191, "mean_token_accuracy": 0.5730376541614532, "num_tokens": 2225612.0, "step": 5070 }, { "epoch": 1.5200478755236384, "grad_norm": 38.5, "learning_rate": 2.467085577498504e-05, "loss": 2.8519, "mean_token_accuracy": 0.4593438282608986, "num_tokens": 2231107.0, "step": 5080 }, { "epoch": 1.5230400957510473, "grad_norm": 29.125, "learning_rate": 2.462098543786156e-05, "loss": 2.1636, "mean_token_accuracy": 0.5632421642541885, "num_tokens": 2235169.0, "step": 5090 }, { "epoch": 1.5260323159784561, "grad_norm": 27.25, "learning_rate": 2.4571115100738083e-05, "loss": 2.2962, "mean_token_accuracy": 0.5278664976358414, "num_tokens": 2239382.0, "step": 5100 }, { "epoch": 1.5290245362058648, "grad_norm": 35.5, "learning_rate": 2.4521244763614604e-05, "loss": 2.8431, "mean_token_accuracy": 0.4671991243958473, "num_tokens": 2244088.0, "step": 5110 }, { "epoch": 1.5320167564332734, "grad_norm": 40.25, "learning_rate": 2.4471374426491124e-05, "loss": 2.3703, "mean_token_accuracy": 0.5311822235584259, "num_tokens": 2249483.0, "step": 5120 }, { "epoch": 1.5350089766606823, "grad_norm": 18.875, "learning_rate": 2.4421504089367645e-05, "loss": 2.1462, "mean_token_accuracy": 0.5573323130607605, "num_tokens": 2253313.0, "step": 5130 }, { "epoch": 1.5380011968880911, "grad_norm": 48.0, "learning_rate": 2.4371633752244166e-05, "loss": 2.4867, "mean_token_accuracy": 0.5295968234539032, "num_tokens": 2258004.0, "step": 5140 }, { "epoch": 1.5409934171154998, "grad_norm": 36.25, "learning_rate": 2.4321763415120687e-05, "loss": 2.4837, "mean_token_accuracy": 0.5265371382236481, "num_tokens": 2262216.0, "step": 5150 }, { "epoch": 1.5439856373429084, "grad_norm": 45.25, "learning_rate": 2.427189307799721e-05, "loss": 2.9704, "mean_token_accuracy": 0.43561657071113585, "num_tokens": 2266858.0, "step": 5160 }, { "epoch": 1.546977857570317, "grad_norm": 40.25, "learning_rate": 2.4222022740873728e-05, "loss": 2.759, "mean_token_accuracy": 0.47565021812915803, "num_tokens": 2270709.0, "step": 5170 }, { "epoch": 1.5499700777977259, "grad_norm": 34.25, "learning_rate": 2.417215240375025e-05, "loss": 2.1739, "mean_token_accuracy": 0.5407566428184509, "num_tokens": 2274979.0, "step": 5180 }, { "epoch": 1.5529622980251347, "grad_norm": 28.625, "learning_rate": 2.4122282066626773e-05, "loss": 2.789, "mean_token_accuracy": 0.4501289531588554, "num_tokens": 2278606.0, "step": 5190 }, { "epoch": 1.5559545182525434, "grad_norm": 20.75, "learning_rate": 2.407241172950329e-05, "loss": 2.3088, "mean_token_accuracy": 0.5458819806575775, "num_tokens": 2282960.0, "step": 5200 }, { "epoch": 1.558946738479952, "grad_norm": 22.625, "learning_rate": 2.4022541392379815e-05, "loss": 2.5838, "mean_token_accuracy": 0.4927642345428467, "num_tokens": 2287682.0, "step": 5210 }, { "epoch": 1.5619389587073609, "grad_norm": 27.625, "learning_rate": 2.3972671055256336e-05, "loss": 2.838, "mean_token_accuracy": 0.4325797975063324, "num_tokens": 2294115.0, "step": 5220 }, { "epoch": 1.5649311789347697, "grad_norm": 33.25, "learning_rate": 2.3922800718132856e-05, "loss": 2.0421, "mean_token_accuracy": 0.5684062302112579, "num_tokens": 2298189.0, "step": 5230 }, { "epoch": 1.5679233991621784, "grad_norm": 34.0, "learning_rate": 2.3872930381009377e-05, "loss": 2.2966, "mean_token_accuracy": 0.5321805104613304, "num_tokens": 2301941.0, "step": 5240 }, { "epoch": 1.570915619389587, "grad_norm": 28.125, "learning_rate": 2.3823060043885898e-05, "loss": 2.032, "mean_token_accuracy": 0.5710376381874085, "num_tokens": 2306666.0, "step": 5250 }, { "epoch": 1.5739078396169957, "grad_norm": 27.125, "learning_rate": 2.377318970676242e-05, "loss": 2.2798, "mean_token_accuracy": 0.5459753274917603, "num_tokens": 2310667.0, "step": 5260 }, { "epoch": 1.5769000598444045, "grad_norm": 34.0, "learning_rate": 2.372331936963894e-05, "loss": 2.193, "mean_token_accuracy": 0.5457642912864685, "num_tokens": 2315430.0, "step": 5270 }, { "epoch": 1.5798922800718134, "grad_norm": 28.125, "learning_rate": 2.367344903251546e-05, "loss": 2.7639, "mean_token_accuracy": 0.4813458412885666, "num_tokens": 2319719.0, "step": 5280 }, { "epoch": 1.582884500299222, "grad_norm": 24.5, "learning_rate": 2.362357869539198e-05, "loss": 2.7784, "mean_token_accuracy": 0.4711227312684059, "num_tokens": 2323940.0, "step": 5290 }, { "epoch": 1.5858767205266306, "grad_norm": 73.0, "learning_rate": 2.3573708358268502e-05, "loss": 2.5437, "mean_token_accuracy": 0.4930324018001556, "num_tokens": 2327459.0, "step": 5300 }, { "epoch": 1.5888689407540395, "grad_norm": 35.25, "learning_rate": 2.3523838021145026e-05, "loss": 2.4414, "mean_token_accuracy": 0.5069448441267014, "num_tokens": 2333115.0, "step": 5310 }, { "epoch": 1.5918611609814484, "grad_norm": 35.25, "learning_rate": 2.3473967684021544e-05, "loss": 2.5232, "mean_token_accuracy": 0.49047706127166746, "num_tokens": 2337354.0, "step": 5320 }, { "epoch": 1.594853381208857, "grad_norm": 37.0, "learning_rate": 2.3424097346898068e-05, "loss": 2.7698, "mean_token_accuracy": 0.46212276816368103, "num_tokens": 2342548.0, "step": 5330 }, { "epoch": 1.5978456014362656, "grad_norm": 30.25, "learning_rate": 2.337422700977459e-05, "loss": 2.2649, "mean_token_accuracy": 0.538226792216301, "num_tokens": 2347913.0, "step": 5340 }, { "epoch": 1.6008378216636745, "grad_norm": 29.375, "learning_rate": 2.3324356672651106e-05, "loss": 3.0188, "mean_token_accuracy": 0.4409749746322632, "num_tokens": 2351777.0, "step": 5350 }, { "epoch": 1.6038300418910831, "grad_norm": 31.0, "learning_rate": 2.327448633552763e-05, "loss": 2.4968, "mean_token_accuracy": 0.4966483414173126, "num_tokens": 2356737.0, "step": 5360 }, { "epoch": 1.606822262118492, "grad_norm": 35.25, "learning_rate": 2.322461599840415e-05, "loss": 1.9533, "mean_token_accuracy": 0.5769125640392303, "num_tokens": 2359790.0, "step": 5370 }, { "epoch": 1.6098144823459006, "grad_norm": 31.875, "learning_rate": 2.3174745661280672e-05, "loss": 2.5364, "mean_token_accuracy": 0.5159052804112434, "num_tokens": 2364322.0, "step": 5380 }, { "epoch": 1.6128067025733093, "grad_norm": 22.25, "learning_rate": 2.3124875324157193e-05, "loss": 2.3025, "mean_token_accuracy": 0.5282308667898178, "num_tokens": 2368854.0, "step": 5390 }, { "epoch": 1.6157989228007181, "grad_norm": 24.5, "learning_rate": 2.3075004987033713e-05, "loss": 2.1343, "mean_token_accuracy": 0.5515360802412033, "num_tokens": 2372689.0, "step": 5400 }, { "epoch": 1.618791143028127, "grad_norm": 25.0, "learning_rate": 2.3025134649910234e-05, "loss": 2.7252, "mean_token_accuracy": 0.4657323956489563, "num_tokens": 2377410.0, "step": 5410 }, { "epoch": 1.6217833632555356, "grad_norm": 22.625, "learning_rate": 2.2975264312786758e-05, "loss": 2.341, "mean_token_accuracy": 0.5442433267831802, "num_tokens": 2382107.0, "step": 5420 }, { "epoch": 1.6247755834829443, "grad_norm": 31.5, "learning_rate": 2.2925393975663276e-05, "loss": 2.3103, "mean_token_accuracy": 0.5383136540651321, "num_tokens": 2386352.0, "step": 5430 }, { "epoch": 1.6277678037103531, "grad_norm": 26.5, "learning_rate": 2.2875523638539796e-05, "loss": 2.6328, "mean_token_accuracy": 0.48615836501121523, "num_tokens": 2391189.0, "step": 5440 }, { "epoch": 1.630760023937762, "grad_norm": 24.0, "learning_rate": 2.282565330141632e-05, "loss": 2.0362, "mean_token_accuracy": 0.572376748919487, "num_tokens": 2394615.0, "step": 5450 }, { "epoch": 1.6337522441651706, "grad_norm": 34.5, "learning_rate": 2.277578296429284e-05, "loss": 2.1694, "mean_token_accuracy": 0.55944344997406, "num_tokens": 2398123.0, "step": 5460 }, { "epoch": 1.6367444643925793, "grad_norm": 27.625, "learning_rate": 2.272591262716936e-05, "loss": 2.253, "mean_token_accuracy": 0.5339977979660034, "num_tokens": 2402810.0, "step": 5470 }, { "epoch": 1.639736684619988, "grad_norm": 14.8125, "learning_rate": 2.2676042290045883e-05, "loss": 1.9026, "mean_token_accuracy": 0.5830067694187164, "num_tokens": 2407040.0, "step": 5480 }, { "epoch": 1.6427289048473968, "grad_norm": 22.125, "learning_rate": 2.2626171952922404e-05, "loss": 2.4653, "mean_token_accuracy": 0.5064762234687805, "num_tokens": 2411166.0, "step": 5490 }, { "epoch": 1.6457211250748056, "grad_norm": 24.25, "learning_rate": 2.257630161579892e-05, "loss": 2.4491, "mean_token_accuracy": 0.5069942772388458, "num_tokens": 2414282.0, "step": 5500 }, { "epoch": 1.6487133453022143, "grad_norm": 39.25, "learning_rate": 2.2526431278675445e-05, "loss": 2.5615, "mean_token_accuracy": 0.5000170230865478, "num_tokens": 2419330.0, "step": 5510 }, { "epoch": 1.6517055655296229, "grad_norm": 26.375, "learning_rate": 2.2476560941551966e-05, "loss": 1.7894, "mean_token_accuracy": 0.608027520775795, "num_tokens": 2423670.0, "step": 5520 }, { "epoch": 1.6546977857570317, "grad_norm": 23.375, "learning_rate": 2.2426690604428487e-05, "loss": 1.8739, "mean_token_accuracy": 0.5978760778903961, "num_tokens": 2428023.0, "step": 5530 }, { "epoch": 1.6576900059844406, "grad_norm": 26.0, "learning_rate": 2.2376820267305008e-05, "loss": 2.1767, "mean_token_accuracy": 0.5449935585260391, "num_tokens": 2432049.0, "step": 5540 }, { "epoch": 1.6606822262118492, "grad_norm": 39.75, "learning_rate": 2.232694993018153e-05, "loss": 1.9426, "mean_token_accuracy": 0.6162794232368469, "num_tokens": 2435742.0, "step": 5550 }, { "epoch": 1.6636744464392579, "grad_norm": 32.5, "learning_rate": 2.227707959305805e-05, "loss": 1.9529, "mean_token_accuracy": 0.5872822850942612, "num_tokens": 2440036.0, "step": 5560 }, { "epoch": 1.6666666666666665, "grad_norm": 32.75, "learning_rate": 2.2227209255934574e-05, "loss": 2.5299, "mean_token_accuracy": 0.5048285767436027, "num_tokens": 2443798.0, "step": 5570 }, { "epoch": 1.6696588868940754, "grad_norm": 26.875, "learning_rate": 2.217733891881109e-05, "loss": 2.2806, "mean_token_accuracy": 0.5388662964105606, "num_tokens": 2448264.0, "step": 5580 }, { "epoch": 1.6726511071214842, "grad_norm": 43.25, "learning_rate": 2.2127468581687612e-05, "loss": 2.751, "mean_token_accuracy": 0.4652885258197784, "num_tokens": 2453502.0, "step": 5590 }, { "epoch": 1.6756433273488929, "grad_norm": 15.6875, "learning_rate": 2.2077598244564136e-05, "loss": 2.5737, "mean_token_accuracy": 0.5215324550867081, "num_tokens": 2457629.0, "step": 5600 }, { "epoch": 1.6786355475763015, "grad_norm": 35.25, "learning_rate": 2.2027727907440657e-05, "loss": 2.275, "mean_token_accuracy": 0.5492411971092224, "num_tokens": 2463140.0, "step": 5610 }, { "epoch": 1.6816277678037104, "grad_norm": 23.125, "learning_rate": 2.1977857570317174e-05, "loss": 2.227, "mean_token_accuracy": 0.5448118776082993, "num_tokens": 2467905.0, "step": 5620 }, { "epoch": 1.6846199880311192, "grad_norm": 50.75, "learning_rate": 2.19279872331937e-05, "loss": 2.0872, "mean_token_accuracy": 0.5600731402635575, "num_tokens": 2472835.0, "step": 5630 }, { "epoch": 1.6876122082585279, "grad_norm": 22.0, "learning_rate": 2.187811689607022e-05, "loss": 2.7493, "mean_token_accuracy": 0.4741476386785507, "num_tokens": 2477281.0, "step": 5640 }, { "epoch": 1.6906044284859365, "grad_norm": 29.25, "learning_rate": 2.182824655894674e-05, "loss": 2.1632, "mean_token_accuracy": 0.5686624765396118, "num_tokens": 2481606.0, "step": 5650 }, { "epoch": 1.6935966487133451, "grad_norm": 43.5, "learning_rate": 2.177837622182326e-05, "loss": 2.656, "mean_token_accuracy": 0.47470551133155825, "num_tokens": 2485039.0, "step": 5660 }, { "epoch": 1.696588868940754, "grad_norm": 23.125, "learning_rate": 2.172850588469978e-05, "loss": 1.8427, "mean_token_accuracy": 0.625142702460289, "num_tokens": 2490589.0, "step": 5670 }, { "epoch": 1.6995810891681629, "grad_norm": 25.0, "learning_rate": 2.1678635547576302e-05, "loss": 2.5159, "mean_token_accuracy": 0.5088352680206298, "num_tokens": 2494052.0, "step": 5680 }, { "epoch": 1.7025733093955715, "grad_norm": 26.5, "learning_rate": 2.1628765210452823e-05, "loss": 2.5968, "mean_token_accuracy": 0.47904575169086455, "num_tokens": 2498952.0, "step": 5690 }, { "epoch": 1.7055655296229801, "grad_norm": 46.75, "learning_rate": 2.1578894873329344e-05, "loss": 3.29, "mean_token_accuracy": 0.4176428884267807, "num_tokens": 2503107.0, "step": 5700 }, { "epoch": 1.708557749850389, "grad_norm": 22.875, "learning_rate": 2.1529024536205865e-05, "loss": 1.9785, "mean_token_accuracy": 0.5829225838184356, "num_tokens": 2507193.0, "step": 5710 }, { "epoch": 1.7115499700777979, "grad_norm": 33.5, "learning_rate": 2.147915419908239e-05, "loss": 2.8111, "mean_token_accuracy": 0.4594015315175056, "num_tokens": 2511145.0, "step": 5720 }, { "epoch": 1.7145421903052065, "grad_norm": 39.5, "learning_rate": 2.1429283861958906e-05, "loss": 2.2739, "mean_token_accuracy": 0.5496653497219086, "num_tokens": 2516512.0, "step": 5730 }, { "epoch": 1.7175344105326151, "grad_norm": 23.625, "learning_rate": 2.137941352483543e-05, "loss": 2.4269, "mean_token_accuracy": 0.4974661499261856, "num_tokens": 2521653.0, "step": 5740 }, { "epoch": 1.720526630760024, "grad_norm": 25.125, "learning_rate": 2.132954318771195e-05, "loss": 2.8225, "mean_token_accuracy": 0.4561440125107765, "num_tokens": 2525305.0, "step": 5750 }, { "epoch": 1.7235188509874326, "grad_norm": 49.25, "learning_rate": 2.1279672850588472e-05, "loss": 2.308, "mean_token_accuracy": 0.5274252116680145, "num_tokens": 2528945.0, "step": 5760 }, { "epoch": 1.7265110712148415, "grad_norm": 21.625, "learning_rate": 2.1229802513464993e-05, "loss": 2.226, "mean_token_accuracy": 0.5421534836292267, "num_tokens": 2533763.0, "step": 5770 }, { "epoch": 1.7295032914422501, "grad_norm": 36.25, "learning_rate": 2.1179932176341514e-05, "loss": 2.2728, "mean_token_accuracy": 0.5372824460268021, "num_tokens": 2536930.0, "step": 5780 }, { "epoch": 1.7324955116696588, "grad_norm": 24.625, "learning_rate": 2.1130061839218034e-05, "loss": 2.1263, "mean_token_accuracy": 0.5437184333801269, "num_tokens": 2542163.0, "step": 5790 }, { "epoch": 1.7354877318970676, "grad_norm": 31.75, "learning_rate": 2.1080191502094555e-05, "loss": 2.5627, "mean_token_accuracy": 0.4753000020980835, "num_tokens": 2545587.0, "step": 5800 }, { "epoch": 1.7384799521244765, "grad_norm": 43.0, "learning_rate": 2.1030321164971076e-05, "loss": 2.1084, "mean_token_accuracy": 0.5742941737174988, "num_tokens": 2550025.0, "step": 5810 }, { "epoch": 1.7414721723518851, "grad_norm": 31.25, "learning_rate": 2.0980450827847597e-05, "loss": 2.0954, "mean_token_accuracy": 0.5463778793811798, "num_tokens": 2554373.0, "step": 5820 }, { "epoch": 1.7444643925792938, "grad_norm": 35.25, "learning_rate": 2.093058049072412e-05, "loss": 2.197, "mean_token_accuracy": 0.5357052236795425, "num_tokens": 2558090.0, "step": 5830 }, { "epoch": 1.7474566128067026, "grad_norm": 31.125, "learning_rate": 2.088071015360064e-05, "loss": 2.4045, "mean_token_accuracy": 0.5454428046941757, "num_tokens": 2562281.0, "step": 5840 }, { "epoch": 1.7504488330341115, "grad_norm": 17.0, "learning_rate": 2.083083981647716e-05, "loss": 2.6007, "mean_token_accuracy": 0.5035086557269096, "num_tokens": 2567346.0, "step": 5850 }, { "epoch": 1.75344105326152, "grad_norm": 18.5, "learning_rate": 2.0780969479353683e-05, "loss": 2.5044, "mean_token_accuracy": 0.5108116358518601, "num_tokens": 2571166.0, "step": 5860 }, { "epoch": 1.7564332734889287, "grad_norm": 24.125, "learning_rate": 2.0731099142230204e-05, "loss": 2.4193, "mean_token_accuracy": 0.5259674608707428, "num_tokens": 2577029.0, "step": 5870 }, { "epoch": 1.7594254937163374, "grad_norm": 48.5, "learning_rate": 2.068122880510672e-05, "loss": 2.5751, "mean_token_accuracy": 0.5132578700780869, "num_tokens": 2581099.0, "step": 5880 }, { "epoch": 1.7624177139437462, "grad_norm": 12.625, "learning_rate": 2.0631358467983246e-05, "loss": 2.575, "mean_token_accuracy": 0.528558287024498, "num_tokens": 2585381.0, "step": 5890 }, { "epoch": 1.765409934171155, "grad_norm": 63.75, "learning_rate": 2.0581488130859767e-05, "loss": 2.1645, "mean_token_accuracy": 0.558303925395012, "num_tokens": 2588571.0, "step": 5900 }, { "epoch": 1.7684021543985637, "grad_norm": 24.125, "learning_rate": 2.0531617793736287e-05, "loss": 2.2552, "mean_token_accuracy": 0.5411330461502075, "num_tokens": 2593493.0, "step": 5910 }, { "epoch": 1.7713943746259724, "grad_norm": 41.0, "learning_rate": 2.0481747456612808e-05, "loss": 2.4276, "mean_token_accuracy": 0.535361161828041, "num_tokens": 2597065.0, "step": 5920 }, { "epoch": 1.7743865948533812, "grad_norm": 36.5, "learning_rate": 2.043187711948933e-05, "loss": 2.292, "mean_token_accuracy": 0.534418734908104, "num_tokens": 2602326.0, "step": 5930 }, { "epoch": 1.77737881508079, "grad_norm": 37.75, "learning_rate": 2.038200678236585e-05, "loss": 2.5938, "mean_token_accuracy": 0.48518282175064087, "num_tokens": 2605919.0, "step": 5940 }, { "epoch": 1.7803710353081987, "grad_norm": 44.5, "learning_rate": 2.033213644524237e-05, "loss": 2.2369, "mean_token_accuracy": 0.5367230206727982, "num_tokens": 2609223.0, "step": 5950 }, { "epoch": 1.7833632555356074, "grad_norm": 25.25, "learning_rate": 2.028226610811889e-05, "loss": 2.0652, "mean_token_accuracy": 0.5805120468139648, "num_tokens": 2613758.0, "step": 5960 }, { "epoch": 1.786355475763016, "grad_norm": 23.25, "learning_rate": 2.0232395770995412e-05, "loss": 2.5217, "mean_token_accuracy": 0.48961347341537476, "num_tokens": 2618111.0, "step": 5970 }, { "epoch": 1.7893476959904249, "grad_norm": 11.5, "learning_rate": 2.0182525433871936e-05, "loss": 1.8521, "mean_token_accuracy": 0.6123942375183106, "num_tokens": 2623243.0, "step": 5980 }, { "epoch": 1.7923399162178337, "grad_norm": 32.25, "learning_rate": 2.0132655096748454e-05, "loss": 2.1077, "mean_token_accuracy": 0.5779639840126037, "num_tokens": 2627941.0, "step": 5990 }, { "epoch": 1.7953321364452424, "grad_norm": 40.75, "learning_rate": 2.0082784759624974e-05, "loss": 2.5294, "mean_token_accuracy": 0.5101372256875039, "num_tokens": 2632788.0, "step": 6000 }, { "epoch": 1.798324356672651, "grad_norm": 24.375, "learning_rate": 2.00329144225015e-05, "loss": 2.2084, "mean_token_accuracy": 0.56387078166008, "num_tokens": 2638259.0, "step": 6010 }, { "epoch": 1.8013165769000599, "grad_norm": 44.0, "learning_rate": 1.998304408537802e-05, "loss": 2.9581, "mean_token_accuracy": 0.42872937619686124, "num_tokens": 2642095.0, "step": 6020 }, { "epoch": 1.8043087971274687, "grad_norm": 25.5, "learning_rate": 1.9933173748254537e-05, "loss": 2.7151, "mean_token_accuracy": 0.4665300458669662, "num_tokens": 2647272.0, "step": 6030 }, { "epoch": 1.8073010173548774, "grad_norm": 30.375, "learning_rate": 1.988330341113106e-05, "loss": 2.6866, "mean_token_accuracy": 0.477365055680275, "num_tokens": 2650970.0, "step": 6040 }, { "epoch": 1.810293237582286, "grad_norm": 20.25, "learning_rate": 1.9833433074007582e-05, "loss": 2.5152, "mean_token_accuracy": 0.5005413472652436, "num_tokens": 2655297.0, "step": 6050 }, { "epoch": 1.8132854578096946, "grad_norm": 25.375, "learning_rate": 1.9783562736884103e-05, "loss": 2.0359, "mean_token_accuracy": 0.5744021505117416, "num_tokens": 2659789.0, "step": 6060 }, { "epoch": 1.8162776780371035, "grad_norm": 34.25, "learning_rate": 1.9733692399760623e-05, "loss": 2.5769, "mean_token_accuracy": 0.49441861510276797, "num_tokens": 2664682.0, "step": 6070 }, { "epoch": 1.8192698982645124, "grad_norm": 31.0, "learning_rate": 1.9683822062637144e-05, "loss": 2.8461, "mean_token_accuracy": 0.4470598191022873, "num_tokens": 2668051.0, "step": 6080 }, { "epoch": 1.822262118491921, "grad_norm": 33.75, "learning_rate": 1.9633951725513665e-05, "loss": 2.2639, "mean_token_accuracy": 0.5656882345676422, "num_tokens": 2673035.0, "step": 6090 }, { "epoch": 1.8252543387193296, "grad_norm": 33.5, "learning_rate": 1.9584081388390186e-05, "loss": 2.376, "mean_token_accuracy": 0.5445163309574127, "num_tokens": 2676665.0, "step": 6100 }, { "epoch": 1.8282465589467385, "grad_norm": 28.25, "learning_rate": 1.9534211051266707e-05, "loss": 2.2772, "mean_token_accuracy": 0.5421164140105248, "num_tokens": 2680076.0, "step": 6110 }, { "epoch": 1.8312387791741473, "grad_norm": 20.625, "learning_rate": 1.9484340714143227e-05, "loss": 2.0009, "mean_token_accuracy": 0.5641443014144898, "num_tokens": 2685756.0, "step": 6120 }, { "epoch": 1.834230999401556, "grad_norm": 13.0625, "learning_rate": 1.943447037701975e-05, "loss": 2.0111, "mean_token_accuracy": 0.6003489851951599, "num_tokens": 2690244.0, "step": 6130 }, { "epoch": 1.8372232196289646, "grad_norm": 20.25, "learning_rate": 1.938460003989627e-05, "loss": 2.4381, "mean_token_accuracy": 0.5133287519216537, "num_tokens": 2694430.0, "step": 6140 }, { "epoch": 1.8402154398563735, "grad_norm": 17.125, "learning_rate": 1.933472970277279e-05, "loss": 2.2444, "mean_token_accuracy": 0.5365898728370666, "num_tokens": 2700217.0, "step": 6150 }, { "epoch": 1.8432076600837821, "grad_norm": 31.0, "learning_rate": 1.9284859365649314e-05, "loss": 2.0314, "mean_token_accuracy": 0.5679359257221221, "num_tokens": 2702801.0, "step": 6160 }, { "epoch": 1.846199880311191, "grad_norm": 42.0, "learning_rate": 1.9234989028525835e-05, "loss": 2.7164, "mean_token_accuracy": 0.4744591027498245, "num_tokens": 2707886.0, "step": 6170 }, { "epoch": 1.8491921005385996, "grad_norm": 38.5, "learning_rate": 1.9185118691402356e-05, "loss": 2.7306, "mean_token_accuracy": 0.4657700195908546, "num_tokens": 2712105.0, "step": 6180 }, { "epoch": 1.8521843207660083, "grad_norm": 37.25, "learning_rate": 1.9135248354278876e-05, "loss": 2.7078, "mean_token_accuracy": 0.46402439177036287, "num_tokens": 2716450.0, "step": 6190 }, { "epoch": 1.8551765409934171, "grad_norm": 32.75, "learning_rate": 1.9085378017155397e-05, "loss": 2.1767, "mean_token_accuracy": 0.5529489815235138, "num_tokens": 2720919.0, "step": 6200 }, { "epoch": 1.858168761220826, "grad_norm": 24.875, "learning_rate": 1.9035507680031918e-05, "loss": 2.7361, "mean_token_accuracy": 0.4729373618960381, "num_tokens": 2724921.0, "step": 6210 }, { "epoch": 1.8611609814482346, "grad_norm": 28.0, "learning_rate": 1.898563734290844e-05, "loss": 2.3484, "mean_token_accuracy": 0.5261541813611984, "num_tokens": 2728812.0, "step": 6220 }, { "epoch": 1.8641532016756432, "grad_norm": 32.5, "learning_rate": 1.893576700578496e-05, "loss": 2.6856, "mean_token_accuracy": 0.48226538598537444, "num_tokens": 2734365.0, "step": 6230 }, { "epoch": 1.867145421903052, "grad_norm": 37.25, "learning_rate": 1.888589666866148e-05, "loss": 1.9228, "mean_token_accuracy": 0.6168141156435013, "num_tokens": 2738466.0, "step": 6240 }, { "epoch": 1.870137642130461, "grad_norm": 20.625, "learning_rate": 1.8836026331538e-05, "loss": 1.9347, "mean_token_accuracy": 0.5970963388681412, "num_tokens": 2742084.0, "step": 6250 }, { "epoch": 1.8731298623578696, "grad_norm": 30.875, "learning_rate": 1.8786155994414522e-05, "loss": 2.3609, "mean_token_accuracy": 0.5621721386909485, "num_tokens": 2747004.0, "step": 6260 }, { "epoch": 1.8761220825852782, "grad_norm": 24.75, "learning_rate": 1.8736285657291046e-05, "loss": 2.7582, "mean_token_accuracy": 0.4766868263483047, "num_tokens": 2751277.0, "step": 6270 }, { "epoch": 1.8791143028126869, "grad_norm": 31.125, "learning_rate": 1.8686415320167567e-05, "loss": 2.1709, "mean_token_accuracy": 0.5493841350078583, "num_tokens": 2754736.0, "step": 6280 }, { "epoch": 1.8821065230400957, "grad_norm": 29.5, "learning_rate": 1.8636544983044084e-05, "loss": 2.4101, "mean_token_accuracy": 0.5184233725070954, "num_tokens": 2759579.0, "step": 6290 }, { "epoch": 1.8850987432675046, "grad_norm": 43.25, "learning_rate": 1.858667464592061e-05, "loss": 2.7502, "mean_token_accuracy": 0.4661740347743034, "num_tokens": 2763344.0, "step": 6300 }, { "epoch": 1.8880909634949132, "grad_norm": 39.0, "learning_rate": 1.853680430879713e-05, "loss": 1.918, "mean_token_accuracy": 0.5811412990093231, "num_tokens": 2766322.0, "step": 6310 }, { "epoch": 1.8910831837223219, "grad_norm": 27.375, "learning_rate": 1.848693397167365e-05, "loss": 1.6647, "mean_token_accuracy": 0.6429639548063278, "num_tokens": 2770347.0, "step": 6320 }, { "epoch": 1.8940754039497307, "grad_norm": 34.0, "learning_rate": 1.843706363455017e-05, "loss": 2.7445, "mean_token_accuracy": 0.4729799941182137, "num_tokens": 2775124.0, "step": 6330 }, { "epoch": 1.8970676241771396, "grad_norm": 29.875, "learning_rate": 1.838719329742669e-05, "loss": 2.3141, "mean_token_accuracy": 0.5422122359275818, "num_tokens": 2779961.0, "step": 6340 }, { "epoch": 1.9000598444045482, "grad_norm": 23.5, "learning_rate": 1.8337322960303212e-05, "loss": 2.2344, "mean_token_accuracy": 0.5411304891109466, "num_tokens": 2783692.0, "step": 6350 }, { "epoch": 1.9030520646319569, "grad_norm": 30.625, "learning_rate": 1.8287452623179737e-05, "loss": 1.9572, "mean_token_accuracy": 0.5806999713182449, "num_tokens": 2788229.0, "step": 6360 }, { "epoch": 1.9060442848593655, "grad_norm": 45.75, "learning_rate": 1.8237582286056254e-05, "loss": 2.02, "mean_token_accuracy": 0.596290436387062, "num_tokens": 2792558.0, "step": 6370 }, { "epoch": 1.9090365050867744, "grad_norm": 40.0, "learning_rate": 1.8187711948932775e-05, "loss": 1.9781, "mean_token_accuracy": 0.5878248095512391, "num_tokens": 2797147.0, "step": 6380 }, { "epoch": 1.9120287253141832, "grad_norm": 20.125, "learning_rate": 1.81378416118093e-05, "loss": 2.3107, "mean_token_accuracy": 0.5291865646839142, "num_tokens": 2801640.0, "step": 6390 }, { "epoch": 1.9150209455415919, "grad_norm": 28.625, "learning_rate": 1.8087971274685816e-05, "loss": 1.8296, "mean_token_accuracy": 0.6036499828100205, "num_tokens": 2806291.0, "step": 6400 }, { "epoch": 1.9180131657690005, "grad_norm": 25.125, "learning_rate": 1.8038100937562337e-05, "loss": 1.9843, "mean_token_accuracy": 0.5820193707942962, "num_tokens": 2810023.0, "step": 6410 }, { "epoch": 1.9210053859964094, "grad_norm": 27.625, "learning_rate": 1.798823060043886e-05, "loss": 2.4543, "mean_token_accuracy": 0.5143000841140747, "num_tokens": 2814985.0, "step": 6420 }, { "epoch": 1.9239976062238182, "grad_norm": 30.0, "learning_rate": 1.7938360263315382e-05, "loss": 2.068, "mean_token_accuracy": 0.5631296753883361, "num_tokens": 2820578.0, "step": 6430 }, { "epoch": 1.9269898264512269, "grad_norm": 31.125, "learning_rate": 1.78884899261919e-05, "loss": 2.5761, "mean_token_accuracy": 0.48471910059452056, "num_tokens": 2824205.0, "step": 6440 }, { "epoch": 1.9299820466786355, "grad_norm": 26.5, "learning_rate": 1.7838619589068424e-05, "loss": 2.3299, "mean_token_accuracy": 0.5316969901323318, "num_tokens": 2828031.0, "step": 6450 }, { "epoch": 1.9329742669060441, "grad_norm": 43.75, "learning_rate": 1.7788749251944945e-05, "loss": 2.7208, "mean_token_accuracy": 0.49289258420467374, "num_tokens": 2831445.0, "step": 6460 }, { "epoch": 1.935966487133453, "grad_norm": 19.375, "learning_rate": 1.7738878914821465e-05, "loss": 1.7553, "mean_token_accuracy": 0.6282897502183914, "num_tokens": 2835771.0, "step": 6470 }, { "epoch": 1.9389587073608618, "grad_norm": 20.875, "learning_rate": 1.7689008577697986e-05, "loss": 2.1944, "mean_token_accuracy": 0.5652035892009735, "num_tokens": 2840434.0, "step": 6480 }, { "epoch": 1.9419509275882705, "grad_norm": 22.25, "learning_rate": 1.7639138240574507e-05, "loss": 2.1051, "mean_token_accuracy": 0.5805429667234421, "num_tokens": 2843900.0, "step": 6490 }, { "epoch": 1.9449431478156791, "grad_norm": 27.25, "learning_rate": 1.7589267903451028e-05, "loss": 2.7318, "mean_token_accuracy": 0.5218796044588089, "num_tokens": 2849886.0, "step": 6500 }, { "epoch": 1.947935368043088, "grad_norm": 33.25, "learning_rate": 1.7539397566327552e-05, "loss": 2.5077, "mean_token_accuracy": 0.5040024489164352, "num_tokens": 2852596.0, "step": 6510 }, { "epoch": 1.9509275882704968, "grad_norm": 35.5, "learning_rate": 1.748952722920407e-05, "loss": 1.7998, "mean_token_accuracy": 0.6108117759227752, "num_tokens": 2856341.0, "step": 6520 }, { "epoch": 1.9539198084979055, "grad_norm": 23.75, "learning_rate": 1.743965689208059e-05, "loss": 2.3724, "mean_token_accuracy": 0.5622206628322601, "num_tokens": 2859928.0, "step": 6530 }, { "epoch": 1.9569120287253141, "grad_norm": 26.375, "learning_rate": 1.7389786554957114e-05, "loss": 2.2704, "mean_token_accuracy": 0.549874222278595, "num_tokens": 2865567.0, "step": 6540 }, { "epoch": 1.959904248952723, "grad_norm": 46.75, "learning_rate": 1.733991621783363e-05, "loss": 2.0852, "mean_token_accuracy": 0.5803373664617538, "num_tokens": 2869227.0, "step": 6550 }, { "epoch": 1.9628964691801316, "grad_norm": 36.5, "learning_rate": 1.7290045880710152e-05, "loss": 2.8152, "mean_token_accuracy": 0.46767871379852294, "num_tokens": 2874440.0, "step": 6560 }, { "epoch": 1.9658886894075405, "grad_norm": 13.25, "learning_rate": 1.7240175543586677e-05, "loss": 1.9414, "mean_token_accuracy": 0.5902773320674897, "num_tokens": 2878799.0, "step": 6570 }, { "epoch": 1.968880909634949, "grad_norm": 38.0, "learning_rate": 1.7190305206463197e-05, "loss": 2.8453, "mean_token_accuracy": 0.4565995991230011, "num_tokens": 2882617.0, "step": 6580 }, { "epoch": 1.9718731298623577, "grad_norm": 27.75, "learning_rate": 1.7140434869339718e-05, "loss": 2.3815, "mean_token_accuracy": 0.5338667601346969, "num_tokens": 2887111.0, "step": 6590 }, { "epoch": 1.9748653500897666, "grad_norm": 39.75, "learning_rate": 1.709056453221624e-05, "loss": 2.4473, "mean_token_accuracy": 0.5243928819894791, "num_tokens": 2891051.0, "step": 6600 }, { "epoch": 1.9778575703171755, "grad_norm": 24.375, "learning_rate": 1.704069419509276e-05, "loss": 2.0229, "mean_token_accuracy": 0.5819611266255379, "num_tokens": 2895900.0, "step": 6610 }, { "epoch": 1.980849790544584, "grad_norm": 45.25, "learning_rate": 1.699082385796928e-05, "loss": 1.8578, "mean_token_accuracy": 0.6056030809879303, "num_tokens": 2901008.0, "step": 6620 }, { "epoch": 1.9838420107719927, "grad_norm": 29.625, "learning_rate": 1.69409535208458e-05, "loss": 2.2429, "mean_token_accuracy": 0.5784931719303131, "num_tokens": 2905651.0, "step": 6630 }, { "epoch": 1.9868342309994016, "grad_norm": 29.875, "learning_rate": 1.6891083183722322e-05, "loss": 2.0341, "mean_token_accuracy": 0.5632521450519562, "num_tokens": 2909307.0, "step": 6640 }, { "epoch": 1.9898264512268105, "grad_norm": 37.0, "learning_rate": 1.6841212846598843e-05, "loss": 2.0049, "mean_token_accuracy": 0.5811235994100571, "num_tokens": 2913081.0, "step": 6650 }, { "epoch": 1.992818671454219, "grad_norm": 28.5, "learning_rate": 1.6791342509475367e-05, "loss": 2.6649, "mean_token_accuracy": 0.4999321013689041, "num_tokens": 2918402.0, "step": 6660 }, { "epoch": 1.9958108916816277, "grad_norm": 29.0, "learning_rate": 1.6741472172351885e-05, "loss": 2.0443, "mean_token_accuracy": 0.5667910605669022, "num_tokens": 2923258.0, "step": 6670 }, { "epoch": 1.9988031119090364, "grad_norm": 26.875, "learning_rate": 1.6691601835228405e-05, "loss": 1.8138, "mean_token_accuracy": 0.629214882850647, "num_tokens": 2926579.0, "step": 6680 }, { "epoch": 2.0, "eval_loss": 2.242931842803955, "eval_mean_token_accuracy": 0.5522286700752547, "eval_num_tokens": 2928488.0, "eval_runtime": 28.1437, "eval_samples_per_second": 14.852, "eval_steps_per_second": 1.883, "step": 6684 }, { "epoch": 2.0017953321364454, "grad_norm": 31.125, "learning_rate": 1.664173149810493e-05, "loss": 1.2534, "mean_token_accuracy": 0.7216873943805695, "num_tokens": 2930316.0, "step": 6690 }, { "epoch": 2.004787552363854, "grad_norm": 19.875, "learning_rate": 1.6591861160981447e-05, "loss": 2.1657, "mean_token_accuracy": 0.5659398257732391, "num_tokens": 2935350.0, "step": 6700 }, { "epoch": 2.0077797725912627, "grad_norm": 25.125, "learning_rate": 1.654199082385797e-05, "loss": 1.9931, "mean_token_accuracy": 0.5885084450244904, "num_tokens": 2939609.0, "step": 6710 }, { "epoch": 2.0107719928186714, "grad_norm": 29.25, "learning_rate": 1.6492120486734492e-05, "loss": 2.232, "mean_token_accuracy": 0.5317733138799667, "num_tokens": 2944532.0, "step": 6720 }, { "epoch": 2.01376421304608, "grad_norm": 33.25, "learning_rate": 1.6442250149611013e-05, "loss": 2.0345, "mean_token_accuracy": 0.5766903549432755, "num_tokens": 2948542.0, "step": 6730 }, { "epoch": 2.016756433273489, "grad_norm": 26.5, "learning_rate": 1.6392379812487534e-05, "loss": 2.2125, "mean_token_accuracy": 0.5706226840615273, "num_tokens": 2953419.0, "step": 6740 }, { "epoch": 2.0197486535008977, "grad_norm": 33.5, "learning_rate": 1.6342509475364054e-05, "loss": 1.7447, "mean_token_accuracy": 0.633513942360878, "num_tokens": 2956668.0, "step": 6750 }, { "epoch": 2.0227408737283064, "grad_norm": 33.0, "learning_rate": 1.6292639138240575e-05, "loss": 1.6458, "mean_token_accuracy": 0.6448787659406662, "num_tokens": 2959587.0, "step": 6760 }, { "epoch": 2.025733093955715, "grad_norm": 27.125, "learning_rate": 1.6242768801117096e-05, "loss": 2.2686, "mean_token_accuracy": 0.5233890146017075, "num_tokens": 2964296.0, "step": 6770 }, { "epoch": 2.028725314183124, "grad_norm": 50.0, "learning_rate": 1.6192898463993617e-05, "loss": 1.961, "mean_token_accuracy": 0.5838585048913956, "num_tokens": 2968026.0, "step": 6780 }, { "epoch": 2.0317175344105327, "grad_norm": 14.0625, "learning_rate": 1.6143028126870137e-05, "loss": 2.2359, "mean_token_accuracy": 0.550393459200859, "num_tokens": 2974380.0, "step": 6790 }, { "epoch": 2.0347097546379413, "grad_norm": 37.0, "learning_rate": 1.609315778974666e-05, "loss": 1.8793, "mean_token_accuracy": 0.6059096574783325, "num_tokens": 2980634.0, "step": 6800 }, { "epoch": 2.03770197486535, "grad_norm": 18.75, "learning_rate": 1.604328745262318e-05, "loss": 1.7964, "mean_token_accuracy": 0.6337763130664825, "num_tokens": 2984793.0, "step": 6810 }, { "epoch": 2.0406941950927586, "grad_norm": 21.875, "learning_rate": 1.59934171154997e-05, "loss": 1.7028, "mean_token_accuracy": 0.6147032797336578, "num_tokens": 2989861.0, "step": 6820 }, { "epoch": 2.0436864153201677, "grad_norm": 34.0, "learning_rate": 1.5943546778376224e-05, "loss": 2.1306, "mean_token_accuracy": 0.5623955458402634, "num_tokens": 2995027.0, "step": 6830 }, { "epoch": 2.0466786355475763, "grad_norm": 44.25, "learning_rate": 1.5893676441252745e-05, "loss": 2.3411, "mean_token_accuracy": 0.5126163840293885, "num_tokens": 2999046.0, "step": 6840 }, { "epoch": 2.049670855774985, "grad_norm": 40.0, "learning_rate": 1.5843806104129262e-05, "loss": 2.2722, "mean_token_accuracy": 0.5325024276971817, "num_tokens": 3003260.0, "step": 6850 }, { "epoch": 2.0526630760023936, "grad_norm": 31.0, "learning_rate": 1.5793935767005786e-05, "loss": 2.1896, "mean_token_accuracy": 0.5521072208881378, "num_tokens": 3007547.0, "step": 6860 }, { "epoch": 2.0556552962298027, "grad_norm": 68.0, "learning_rate": 1.5744065429882307e-05, "loss": 2.0907, "mean_token_accuracy": 0.5546874791383744, "num_tokens": 3011215.0, "step": 6870 }, { "epoch": 2.0586475164572113, "grad_norm": 32.75, "learning_rate": 1.5694195092758828e-05, "loss": 2.5391, "mean_token_accuracy": 0.486826691031456, "num_tokens": 3015832.0, "step": 6880 }, { "epoch": 2.06163973668462, "grad_norm": 26.875, "learning_rate": 1.564432475563535e-05, "loss": 2.0941, "mean_token_accuracy": 0.5605701714754104, "num_tokens": 3020382.0, "step": 6890 }, { "epoch": 2.0646319569120286, "grad_norm": 27.75, "learning_rate": 1.559445441851187e-05, "loss": 1.8473, "mean_token_accuracy": 0.6215524405241013, "num_tokens": 3024830.0, "step": 6900 }, { "epoch": 2.0676241771394377, "grad_norm": 17.0, "learning_rate": 1.554458408138839e-05, "loss": 1.8716, "mean_token_accuracy": 0.5953522980213165, "num_tokens": 3029463.0, "step": 6910 }, { "epoch": 2.0706163973668463, "grad_norm": 30.375, "learning_rate": 1.5494713744264915e-05, "loss": 1.8055, "mean_token_accuracy": 0.6283937364816665, "num_tokens": 3033927.0, "step": 6920 }, { "epoch": 2.073608617594255, "grad_norm": 24.875, "learning_rate": 1.5444843407141432e-05, "loss": 1.8005, "mean_token_accuracy": 0.6029859393835068, "num_tokens": 3037770.0, "step": 6930 }, { "epoch": 2.0766008378216636, "grad_norm": 31.5, "learning_rate": 1.5394973070017953e-05, "loss": 1.9557, "mean_token_accuracy": 0.5977991700172425, "num_tokens": 3041979.0, "step": 6940 }, { "epoch": 2.0795930580490722, "grad_norm": 39.25, "learning_rate": 1.5345102732894477e-05, "loss": 2.1927, "mean_token_accuracy": 0.5491878598928451, "num_tokens": 3045573.0, "step": 6950 }, { "epoch": 2.0825852782764813, "grad_norm": 30.5, "learning_rate": 1.5295232395770994e-05, "loss": 2.5016, "mean_token_accuracy": 0.5196672089397907, "num_tokens": 3050255.0, "step": 6960 }, { "epoch": 2.08557749850389, "grad_norm": 30.0, "learning_rate": 1.5245362058647517e-05, "loss": 2.4885, "mean_token_accuracy": 0.49733306765556334, "num_tokens": 3055615.0, "step": 6970 }, { "epoch": 2.0885697187312986, "grad_norm": 27.875, "learning_rate": 1.5195491721524038e-05, "loss": 2.1265, "mean_token_accuracy": 0.5600803822278977, "num_tokens": 3060218.0, "step": 6980 }, { "epoch": 2.0915619389587072, "grad_norm": 36.5, "learning_rate": 1.514562138440056e-05, "loss": 2.8598, "mean_token_accuracy": 0.45551442801952363, "num_tokens": 3063650.0, "step": 6990 }, { "epoch": 2.0945541591861163, "grad_norm": 23.625, "learning_rate": 1.509575104727708e-05, "loss": 2.1769, "mean_token_accuracy": 0.5520025372505188, "num_tokens": 3067429.0, "step": 7000 }, { "epoch": 2.097546379413525, "grad_norm": 26.875, "learning_rate": 1.5045880710153602e-05, "loss": 2.2834, "mean_token_accuracy": 0.5472053974866867, "num_tokens": 3070937.0, "step": 7010 }, { "epoch": 2.1005385996409336, "grad_norm": 27.375, "learning_rate": 1.4996010373030123e-05, "loss": 2.2649, "mean_token_accuracy": 0.5259867966175079, "num_tokens": 3076459.0, "step": 7020 }, { "epoch": 2.1035308198683422, "grad_norm": 25.875, "learning_rate": 1.4946140035906645e-05, "loss": 2.0102, "mean_token_accuracy": 0.5812536478042603, "num_tokens": 3080734.0, "step": 7030 }, { "epoch": 2.106523040095751, "grad_norm": 43.5, "learning_rate": 1.4896269698783164e-05, "loss": 2.2867, "mean_token_accuracy": 0.5594320222735405, "num_tokens": 3084628.0, "step": 7040 }, { "epoch": 2.10951526032316, "grad_norm": 35.0, "learning_rate": 1.4846399361659685e-05, "loss": 2.0889, "mean_token_accuracy": 0.5646486788988113, "num_tokens": 3089221.0, "step": 7050 }, { "epoch": 2.1125074805505686, "grad_norm": 35.0, "learning_rate": 1.4796529024536207e-05, "loss": 1.4921, "mean_token_accuracy": 0.6738329261541367, "num_tokens": 3093896.0, "step": 7060 }, { "epoch": 2.115499700777977, "grad_norm": 30.5, "learning_rate": 1.4746658687412728e-05, "loss": 2.18, "mean_token_accuracy": 0.5686471343040467, "num_tokens": 3099829.0, "step": 7070 }, { "epoch": 2.118491921005386, "grad_norm": 25.0, "learning_rate": 1.4696788350289247e-05, "loss": 2.137, "mean_token_accuracy": 0.5568736657500267, "num_tokens": 3104969.0, "step": 7080 }, { "epoch": 2.121484141232795, "grad_norm": 36.25, "learning_rate": 1.464691801316577e-05, "loss": 1.5476, "mean_token_accuracy": 0.6572899341583252, "num_tokens": 3108655.0, "step": 7090 }, { "epoch": 2.1244763614602036, "grad_norm": 26.5, "learning_rate": 1.459704767604229e-05, "loss": 2.1881, "mean_token_accuracy": 0.5540028423070907, "num_tokens": 3114053.0, "step": 7100 }, { "epoch": 2.127468581687612, "grad_norm": 24.125, "learning_rate": 1.454717733891881e-05, "loss": 1.575, "mean_token_accuracy": 0.6586545079946518, "num_tokens": 3119135.0, "step": 7110 }, { "epoch": 2.130460801915021, "grad_norm": 45.75, "learning_rate": 1.4497307001795332e-05, "loss": 1.9054, "mean_token_accuracy": 0.6022154986858368, "num_tokens": 3123334.0, "step": 7120 }, { "epoch": 2.1334530221424295, "grad_norm": 26.75, "learning_rate": 1.4447436664671855e-05, "loss": 1.6995, "mean_token_accuracy": 0.6492284148931503, "num_tokens": 3128890.0, "step": 7130 }, { "epoch": 2.1364452423698386, "grad_norm": 5.46875, "learning_rate": 1.4397566327548375e-05, "loss": 2.4091, "mean_token_accuracy": 0.5174450129270554, "num_tokens": 3133639.0, "step": 7140 }, { "epoch": 2.139437462597247, "grad_norm": 28.75, "learning_rate": 1.4347695990424895e-05, "loss": 2.2288, "mean_token_accuracy": 0.5391303896903992, "num_tokens": 3137904.0, "step": 7150 }, { "epoch": 2.142429682824656, "grad_norm": 30.25, "learning_rate": 1.4297825653301417e-05, "loss": 1.524, "mean_token_accuracy": 0.6681494385004043, "num_tokens": 3143124.0, "step": 7160 }, { "epoch": 2.1454219030520645, "grad_norm": 30.0, "learning_rate": 1.4247955316177938e-05, "loss": 1.8881, "mean_token_accuracy": 0.608069297671318, "num_tokens": 3147219.0, "step": 7170 }, { "epoch": 2.1484141232794736, "grad_norm": 26.625, "learning_rate": 1.419808497905446e-05, "loss": 1.9308, "mean_token_accuracy": 0.5991872996091843, "num_tokens": 3151863.0, "step": 7180 }, { "epoch": 2.151406343506882, "grad_norm": 39.75, "learning_rate": 1.414821464193098e-05, "loss": 1.6427, "mean_token_accuracy": 0.6218064963817597, "num_tokens": 3155373.0, "step": 7190 }, { "epoch": 2.154398563734291, "grad_norm": 34.5, "learning_rate": 1.40983443048075e-05, "loss": 2.276, "mean_token_accuracy": 0.545945143699646, "num_tokens": 3159495.0, "step": 7200 }, { "epoch": 2.1573907839616995, "grad_norm": 26.25, "learning_rate": 1.4048473967684023e-05, "loss": 1.938, "mean_token_accuracy": 0.5874078899621964, "num_tokens": 3164398.0, "step": 7210 }, { "epoch": 2.160383004189108, "grad_norm": 23.25, "learning_rate": 1.3998603630560545e-05, "loss": 2.2729, "mean_token_accuracy": 0.5383491784334182, "num_tokens": 3166982.0, "step": 7220 }, { "epoch": 2.163375224416517, "grad_norm": 58.75, "learning_rate": 1.3948733293437064e-05, "loss": 1.6851, "mean_token_accuracy": 0.6350438117980957, "num_tokens": 3170364.0, "step": 7230 }, { "epoch": 2.166367444643926, "grad_norm": 33.25, "learning_rate": 1.3898862956313585e-05, "loss": 1.8358, "mean_token_accuracy": 0.6009310275316239, "num_tokens": 3173578.0, "step": 7240 }, { "epoch": 2.1693596648713345, "grad_norm": 26.625, "learning_rate": 1.3848992619190108e-05, "loss": 2.528, "mean_token_accuracy": 0.507240629196167, "num_tokens": 3178958.0, "step": 7250 }, { "epoch": 2.172351885098743, "grad_norm": 32.5, "learning_rate": 1.3799122282066627e-05, "loss": 1.7139, "mean_token_accuracy": 0.6379852175712586, "num_tokens": 3182457.0, "step": 7260 }, { "epoch": 2.175344105326152, "grad_norm": 73.0, "learning_rate": 1.3749251944943147e-05, "loss": 1.7549, "mean_token_accuracy": 0.6359315663576126, "num_tokens": 3186873.0, "step": 7270 }, { "epoch": 2.178336325553561, "grad_norm": 26.0, "learning_rate": 1.369938160781967e-05, "loss": 1.7649, "mean_token_accuracy": 0.6203795820474625, "num_tokens": 3190376.0, "step": 7280 }, { "epoch": 2.1813285457809695, "grad_norm": 15.75, "learning_rate": 1.364951127069619e-05, "loss": 1.8914, "mean_token_accuracy": 0.6036392211914062, "num_tokens": 3195318.0, "step": 7290 }, { "epoch": 2.184320766008378, "grad_norm": 49.0, "learning_rate": 1.359964093357271e-05, "loss": 1.9615, "mean_token_accuracy": 0.6003385275602341, "num_tokens": 3199389.0, "step": 7300 }, { "epoch": 2.1873129862357867, "grad_norm": 25.75, "learning_rate": 1.3549770596449232e-05, "loss": 2.3876, "mean_token_accuracy": 0.5285185039043426, "num_tokens": 3204839.0, "step": 7310 }, { "epoch": 2.190305206463196, "grad_norm": 30.25, "learning_rate": 1.3499900259325755e-05, "loss": 1.8095, "mean_token_accuracy": 0.6179131418466568, "num_tokens": 3208534.0, "step": 7320 }, { "epoch": 2.1932974266906045, "grad_norm": 48.0, "learning_rate": 1.3450029922202276e-05, "loss": 2.143, "mean_token_accuracy": 0.5466442197561264, "num_tokens": 3213107.0, "step": 7330 }, { "epoch": 2.196289646918013, "grad_norm": 19.0, "learning_rate": 1.3400159585078795e-05, "loss": 2.1544, "mean_token_accuracy": 0.5515126138925552, "num_tokens": 3217204.0, "step": 7340 }, { "epoch": 2.1992818671454217, "grad_norm": 30.25, "learning_rate": 1.3350289247955317e-05, "loss": 1.325, "mean_token_accuracy": 0.6974451810121536, "num_tokens": 3222040.0, "step": 7350 }, { "epoch": 2.202274087372831, "grad_norm": 27.125, "learning_rate": 1.3300418910831838e-05, "loss": 2.1716, "mean_token_accuracy": 0.5470889180898666, "num_tokens": 3227199.0, "step": 7360 }, { "epoch": 2.2052663076002395, "grad_norm": 27.375, "learning_rate": 1.325054857370836e-05, "loss": 2.3477, "mean_token_accuracy": 0.5298527553677559, "num_tokens": 3231808.0, "step": 7370 }, { "epoch": 2.208258527827648, "grad_norm": 46.25, "learning_rate": 1.320067823658488e-05, "loss": 1.5753, "mean_token_accuracy": 0.6611310541629791, "num_tokens": 3235862.0, "step": 7380 }, { "epoch": 2.2112507480550567, "grad_norm": 29.875, "learning_rate": 1.31508078994614e-05, "loss": 1.6568, "mean_token_accuracy": 0.6521678894758225, "num_tokens": 3239945.0, "step": 7390 }, { "epoch": 2.2142429682824654, "grad_norm": 54.25, "learning_rate": 1.3100937562337923e-05, "loss": 1.7918, "mean_token_accuracy": 0.6133479118347168, "num_tokens": 3243822.0, "step": 7400 }, { "epoch": 2.2172351885098744, "grad_norm": 43.0, "learning_rate": 1.3051067225214442e-05, "loss": 1.7599, "mean_token_accuracy": 0.6251044929027557, "num_tokens": 3248413.0, "step": 7410 }, { "epoch": 2.220227408737283, "grad_norm": 32.5, "learning_rate": 1.3001196888090963e-05, "loss": 2.4222, "mean_token_accuracy": 0.5013878315687179, "num_tokens": 3252864.0, "step": 7420 }, { "epoch": 2.2232196289646917, "grad_norm": 25.0, "learning_rate": 1.2951326550967485e-05, "loss": 1.9676, "mean_token_accuracy": 0.5924913167953492, "num_tokens": 3258877.0, "step": 7430 }, { "epoch": 2.2262118491921004, "grad_norm": 32.25, "learning_rate": 1.2901456213844008e-05, "loss": 2.1059, "mean_token_accuracy": 0.5556224346160888, "num_tokens": 3263966.0, "step": 7440 }, { "epoch": 2.2292040694195094, "grad_norm": 29.75, "learning_rate": 1.2851585876720527e-05, "loss": 1.6945, "mean_token_accuracy": 0.6180789291858673, "num_tokens": 3268942.0, "step": 7450 }, { "epoch": 2.232196289646918, "grad_norm": 18.75, "learning_rate": 1.2801715539597048e-05, "loss": 1.9998, "mean_token_accuracy": 0.5755996882915497, "num_tokens": 3273260.0, "step": 7460 }, { "epoch": 2.2351885098743267, "grad_norm": 38.0, "learning_rate": 1.275184520247357e-05, "loss": 2.0105, "mean_token_accuracy": 0.5683426201343537, "num_tokens": 3277591.0, "step": 7470 }, { "epoch": 2.2381807301017353, "grad_norm": 33.25, "learning_rate": 1.2701974865350091e-05, "loss": 2.1261, "mean_token_accuracy": 0.5513424545526504, "num_tokens": 3280618.0, "step": 7480 }, { "epoch": 2.2411729503291444, "grad_norm": 31.75, "learning_rate": 1.265210452822661e-05, "loss": 1.9736, "mean_token_accuracy": 0.58391652405262, "num_tokens": 3285301.0, "step": 7490 }, { "epoch": 2.244165170556553, "grad_norm": 26.5, "learning_rate": 1.2602234191103132e-05, "loss": 2.2189, "mean_token_accuracy": 0.5404590010643006, "num_tokens": 3290313.0, "step": 7500 }, { "epoch": 2.2471573907839617, "grad_norm": 32.0, "learning_rate": 1.2552363853979653e-05, "loss": 1.7485, "mean_token_accuracy": 0.6247122138738632, "num_tokens": 3294690.0, "step": 7510 }, { "epoch": 2.2501496110113703, "grad_norm": 18.0, "learning_rate": 1.2502493516856176e-05, "loss": 2.2453, "mean_token_accuracy": 0.544939911365509, "num_tokens": 3299559.0, "step": 7520 }, { "epoch": 2.253141831238779, "grad_norm": 26.5, "learning_rate": 1.2452623179732697e-05, "loss": 1.9581, "mean_token_accuracy": 0.6086787462234498, "num_tokens": 3305592.0, "step": 7530 }, { "epoch": 2.256134051466188, "grad_norm": 26.0, "learning_rate": 1.2402752842609217e-05, "loss": 1.8021, "mean_token_accuracy": 0.6123438656330109, "num_tokens": 3310369.0, "step": 7540 }, { "epoch": 2.2591262716935967, "grad_norm": 37.25, "learning_rate": 1.2352882505485738e-05, "loss": 2.3046, "mean_token_accuracy": 0.5472021579742432, "num_tokens": 3314477.0, "step": 7550 }, { "epoch": 2.2621184919210053, "grad_norm": 23.25, "learning_rate": 1.2303012168362259e-05, "loss": 2.0173, "mean_token_accuracy": 0.5664256989955903, "num_tokens": 3318450.0, "step": 7560 }, { "epoch": 2.265110712148414, "grad_norm": 14.9375, "learning_rate": 1.225314183123878e-05, "loss": 2.0863, "mean_token_accuracy": 0.5759825021028518, "num_tokens": 3323229.0, "step": 7570 }, { "epoch": 2.2681029323758226, "grad_norm": 17.875, "learning_rate": 1.22032714941153e-05, "loss": 2.7287, "mean_token_accuracy": 0.497147136926651, "num_tokens": 3327917.0, "step": 7580 }, { "epoch": 2.2710951526032317, "grad_norm": 12.1875, "learning_rate": 1.2153401156991821e-05, "loss": 1.8352, "mean_token_accuracy": 0.6011588901281357, "num_tokens": 3331229.0, "step": 7590 }, { "epoch": 2.2740873728306403, "grad_norm": 19.125, "learning_rate": 1.2103530819868344e-05, "loss": 1.8881, "mean_token_accuracy": 0.610486413538456, "num_tokens": 3335337.0, "step": 7600 }, { "epoch": 2.277079593058049, "grad_norm": 29.375, "learning_rate": 1.2053660482744863e-05, "loss": 2.0414, "mean_token_accuracy": 0.5699247926473617, "num_tokens": 3339984.0, "step": 7610 }, { "epoch": 2.280071813285458, "grad_norm": 46.25, "learning_rate": 1.2003790145621385e-05, "loss": 2.2427, "mean_token_accuracy": 0.5332239389419555, "num_tokens": 3343756.0, "step": 7620 }, { "epoch": 2.2830640335128667, "grad_norm": 40.5, "learning_rate": 1.1953919808497906e-05, "loss": 2.0443, "mean_token_accuracy": 0.5590986162424088, "num_tokens": 3347489.0, "step": 7630 }, { "epoch": 2.2860562537402753, "grad_norm": 29.75, "learning_rate": 1.1904049471374427e-05, "loss": 1.7902, "mean_token_accuracy": 0.621918460726738, "num_tokens": 3352828.0, "step": 7640 }, { "epoch": 2.289048473967684, "grad_norm": 37.75, "learning_rate": 1.1854179134250948e-05, "loss": 2.136, "mean_token_accuracy": 0.5378258466720581, "num_tokens": 3356433.0, "step": 7650 }, { "epoch": 2.2920406941950926, "grad_norm": 27.75, "learning_rate": 1.180430879712747e-05, "loss": 1.5546, "mean_token_accuracy": 0.6686955988407135, "num_tokens": 3362086.0, "step": 7660 }, { "epoch": 2.2950329144225017, "grad_norm": 39.5, "learning_rate": 1.175443846000399e-05, "loss": 1.9859, "mean_token_accuracy": 0.5787241399288178, "num_tokens": 3366370.0, "step": 7670 }, { "epoch": 2.2980251346499103, "grad_norm": 59.75, "learning_rate": 1.1704568122880512e-05, "loss": 1.5931, "mean_token_accuracy": 0.6556290119886399, "num_tokens": 3369998.0, "step": 7680 }, { "epoch": 2.301017354877319, "grad_norm": 64.0, "learning_rate": 1.1654697785757033e-05, "loss": 2.3505, "mean_token_accuracy": 0.530811858177185, "num_tokens": 3374677.0, "step": 7690 }, { "epoch": 2.3040095751047276, "grad_norm": 31.875, "learning_rate": 1.1604827448633553e-05, "loss": 2.0854, "mean_token_accuracy": 0.5914190322160721, "num_tokens": 3380055.0, "step": 7700 }, { "epoch": 2.3070017953321367, "grad_norm": 18.375, "learning_rate": 1.1554957111510074e-05, "loss": 1.8562, "mean_token_accuracy": 0.601532056927681, "num_tokens": 3383671.0, "step": 7710 }, { "epoch": 2.3099940155595453, "grad_norm": 30.875, "learning_rate": 1.1505086774386595e-05, "loss": 1.6012, "mean_token_accuracy": 0.6378510087728501, "num_tokens": 3387927.0, "step": 7720 }, { "epoch": 2.312986235786954, "grad_norm": 27.625, "learning_rate": 1.1455216437263116e-05, "loss": 2.6541, "mean_token_accuracy": 0.4840825363993645, "num_tokens": 3391989.0, "step": 7730 }, { "epoch": 2.3159784560143626, "grad_norm": 34.25, "learning_rate": 1.1405346100139637e-05, "loss": 1.9968, "mean_token_accuracy": 0.6017626792192459, "num_tokens": 3397104.0, "step": 7740 }, { "epoch": 2.3189706762417712, "grad_norm": 34.5, "learning_rate": 1.1355475763016159e-05, "loss": 1.9871, "mean_token_accuracy": 0.5988510757684707, "num_tokens": 3402355.0, "step": 7750 }, { "epoch": 2.3219628964691803, "grad_norm": 34.5, "learning_rate": 1.130560542589268e-05, "loss": 1.8842, "mean_token_accuracy": 0.6055288344621659, "num_tokens": 3405790.0, "step": 7760 }, { "epoch": 2.324955116696589, "grad_norm": 35.5, "learning_rate": 1.12557350887692e-05, "loss": 1.9169, "mean_token_accuracy": 0.595410618185997, "num_tokens": 3409449.0, "step": 7770 }, { "epoch": 2.3279473369239976, "grad_norm": 25.375, "learning_rate": 1.1205864751645721e-05, "loss": 1.7218, "mean_token_accuracy": 0.6117983996868134, "num_tokens": 3414393.0, "step": 7780 }, { "epoch": 2.330939557151406, "grad_norm": 32.5, "learning_rate": 1.1155994414522242e-05, "loss": 2.7433, "mean_token_accuracy": 0.46884743869304657, "num_tokens": 3418163.0, "step": 7790 }, { "epoch": 2.3339317773788153, "grad_norm": 44.25, "learning_rate": 1.1106124077398763e-05, "loss": 2.2331, "mean_token_accuracy": 0.5329806178808212, "num_tokens": 3422058.0, "step": 7800 }, { "epoch": 2.336923997606224, "grad_norm": 32.5, "learning_rate": 1.1056253740275286e-05, "loss": 2.0709, "mean_token_accuracy": 0.580215010046959, "num_tokens": 3426116.0, "step": 7810 }, { "epoch": 2.3399162178336326, "grad_norm": 21.625, "learning_rate": 1.1006383403151806e-05, "loss": 2.2106, "mean_token_accuracy": 0.5685110569000245, "num_tokens": 3430160.0, "step": 7820 }, { "epoch": 2.342908438061041, "grad_norm": 38.0, "learning_rate": 1.0956513066028327e-05, "loss": 2.1967, "mean_token_accuracy": 0.5463057830929756, "num_tokens": 3434026.0, "step": 7830 }, { "epoch": 2.34590065828845, "grad_norm": 61.25, "learning_rate": 1.0906642728904848e-05, "loss": 1.9145, "mean_token_accuracy": 0.5919240117073059, "num_tokens": 3438573.0, "step": 7840 }, { "epoch": 2.348892878515859, "grad_norm": 33.25, "learning_rate": 1.085677239178137e-05, "loss": 2.443, "mean_token_accuracy": 0.5153850048780442, "num_tokens": 3444090.0, "step": 7850 }, { "epoch": 2.3518850987432676, "grad_norm": 25.625, "learning_rate": 1.080690205465789e-05, "loss": 1.6518, "mean_token_accuracy": 0.6537412971258163, "num_tokens": 3447767.0, "step": 7860 }, { "epoch": 2.354877318970676, "grad_norm": 39.5, "learning_rate": 1.075703171753441e-05, "loss": 2.2954, "mean_token_accuracy": 0.5424281567335129, "num_tokens": 3451267.0, "step": 7870 }, { "epoch": 2.357869539198085, "grad_norm": 50.25, "learning_rate": 1.0707161380410933e-05, "loss": 2.0823, "mean_token_accuracy": 0.5709426254034042, "num_tokens": 3455425.0, "step": 7880 }, { "epoch": 2.360861759425494, "grad_norm": 25.625, "learning_rate": 1.0657291043287452e-05, "loss": 2.2418, "mean_token_accuracy": 0.5553388357162475, "num_tokens": 3459713.0, "step": 7890 }, { "epoch": 2.3638539796529026, "grad_norm": 23.875, "learning_rate": 1.0607420706163974e-05, "loss": 2.0082, "mean_token_accuracy": 0.6014505207538605, "num_tokens": 3463321.0, "step": 7900 }, { "epoch": 2.366846199880311, "grad_norm": 25.5, "learning_rate": 1.0557550369040495e-05, "loss": 1.8273, "mean_token_accuracy": 0.6049247473478317, "num_tokens": 3468143.0, "step": 7910 }, { "epoch": 2.36983842010772, "grad_norm": 22.625, "learning_rate": 1.0507680031917016e-05, "loss": 1.736, "mean_token_accuracy": 0.6522809594869614, "num_tokens": 3471017.0, "step": 7920 }, { "epoch": 2.3728306403351285, "grad_norm": 43.5, "learning_rate": 1.0457809694793537e-05, "loss": 1.9577, "mean_token_accuracy": 0.5918497428297996, "num_tokens": 3475374.0, "step": 7930 }, { "epoch": 2.3758228605625376, "grad_norm": 20.125, "learning_rate": 1.040793935767006e-05, "loss": 2.023, "mean_token_accuracy": 0.5746719360351562, "num_tokens": 3480701.0, "step": 7940 }, { "epoch": 2.378815080789946, "grad_norm": 27.5, "learning_rate": 1.0358069020546578e-05, "loss": 2.4225, "mean_token_accuracy": 0.5121300339698791, "num_tokens": 3484654.0, "step": 7950 }, { "epoch": 2.381807301017355, "grad_norm": 49.25, "learning_rate": 1.03081986834231e-05, "loss": 2.5751, "mean_token_accuracy": 0.5052287340164184, "num_tokens": 3488371.0, "step": 7960 }, { "epoch": 2.3847995212447635, "grad_norm": 51.75, "learning_rate": 1.0258328346299622e-05, "loss": 1.4107, "mean_token_accuracy": 0.6880755305290223, "num_tokens": 3493507.0, "step": 7970 }, { "epoch": 2.3877917414721725, "grad_norm": 17.375, "learning_rate": 1.0208458009176142e-05, "loss": 2.5548, "mean_token_accuracy": 0.4705794721841812, "num_tokens": 3498623.0, "step": 7980 }, { "epoch": 2.390783961699581, "grad_norm": 27.875, "learning_rate": 1.0158587672052663e-05, "loss": 1.909, "mean_token_accuracy": 0.6068336576223373, "num_tokens": 3502592.0, "step": 7990 }, { "epoch": 2.39377618192699, "grad_norm": 9.8125, "learning_rate": 1.0108717334929186e-05, "loss": 2.1661, "mean_token_accuracy": 0.5623240023851395, "num_tokens": 3507098.0, "step": 8000 }, { "epoch": 2.3967684021543985, "grad_norm": 33.0, "learning_rate": 1.0058846997805706e-05, "loss": 1.8429, "mean_token_accuracy": 0.6223942905664444, "num_tokens": 3511321.0, "step": 8010 }, { "epoch": 2.399760622381807, "grad_norm": 30.625, "learning_rate": 1.0008976660682226e-05, "loss": 1.8434, "mean_token_accuracy": 0.6259656488895416, "num_tokens": 3516657.0, "step": 8020 }, { "epoch": 2.402752842609216, "grad_norm": 27.25, "learning_rate": 9.959106323558748e-06, "loss": 1.9366, "mean_token_accuracy": 0.5986247807741165, "num_tokens": 3520858.0, "step": 8030 }, { "epoch": 2.405745062836625, "grad_norm": 19.0, "learning_rate": 9.909235986435269e-06, "loss": 1.5475, "mean_token_accuracy": 0.662953433394432, "num_tokens": 3525453.0, "step": 8040 }, { "epoch": 2.4087372830640335, "grad_norm": 51.25, "learning_rate": 9.85936564931179e-06, "loss": 1.894, "mean_token_accuracy": 0.6088249057531356, "num_tokens": 3529847.0, "step": 8050 }, { "epoch": 2.411729503291442, "grad_norm": 18.375, "learning_rate": 9.80949531218831e-06, "loss": 2.0747, "mean_token_accuracy": 0.5732029706239701, "num_tokens": 3535537.0, "step": 8060 }, { "epoch": 2.414721723518851, "grad_norm": 16.625, "learning_rate": 9.759624975064833e-06, "loss": 2.1334, "mean_token_accuracy": 0.5579861745238304, "num_tokens": 3539446.0, "step": 8070 }, { "epoch": 2.41771394374626, "grad_norm": 12.625, "learning_rate": 9.709754637941352e-06, "loss": 2.0315, "mean_token_accuracy": 0.5979110836982727, "num_tokens": 3542930.0, "step": 8080 }, { "epoch": 2.4207061639736684, "grad_norm": 19.25, "learning_rate": 9.659884300817875e-06, "loss": 1.7469, "mean_token_accuracy": 0.6296554863452911, "num_tokens": 3546756.0, "step": 8090 }, { "epoch": 2.423698384201077, "grad_norm": 37.0, "learning_rate": 9.610013963694395e-06, "loss": 1.7101, "mean_token_accuracy": 0.6294068902730942, "num_tokens": 3550808.0, "step": 8100 }, { "epoch": 2.4266906044284857, "grad_norm": 44.25, "learning_rate": 9.560143626570916e-06, "loss": 1.8353, "mean_token_accuracy": 0.6092359751462937, "num_tokens": 3554987.0, "step": 8110 }, { "epoch": 2.429682824655895, "grad_norm": 37.5, "learning_rate": 9.510273289447437e-06, "loss": 2.305, "mean_token_accuracy": 0.5346463292837143, "num_tokens": 3559276.0, "step": 8120 }, { "epoch": 2.4326750448833034, "grad_norm": 41.0, "learning_rate": 9.46040295232396e-06, "loss": 2.0357, "mean_token_accuracy": 0.5618094027042388, "num_tokens": 3563553.0, "step": 8130 }, { "epoch": 2.435667265110712, "grad_norm": 28.625, "learning_rate": 9.410532615200478e-06, "loss": 1.8299, "mean_token_accuracy": 0.6292552351951599, "num_tokens": 3567569.0, "step": 8140 }, { "epoch": 2.4386594853381207, "grad_norm": 27.375, "learning_rate": 9.360662278077001e-06, "loss": 1.6487, "mean_token_accuracy": 0.6505157589912415, "num_tokens": 3572902.0, "step": 8150 }, { "epoch": 2.44165170556553, "grad_norm": 42.0, "learning_rate": 9.310791940953522e-06, "loss": 1.9738, "mean_token_accuracy": 0.5727031499147415, "num_tokens": 3577788.0, "step": 8160 }, { "epoch": 2.4446439257929384, "grad_norm": 26.75, "learning_rate": 9.260921603830041e-06, "loss": 1.6068, "mean_token_accuracy": 0.6601596623659134, "num_tokens": 3582166.0, "step": 8170 }, { "epoch": 2.447636146020347, "grad_norm": 25.875, "learning_rate": 9.211051266706563e-06, "loss": 2.1705, "mean_token_accuracy": 0.5626641541719437, "num_tokens": 3587132.0, "step": 8180 }, { "epoch": 2.4506283662477557, "grad_norm": 33.75, "learning_rate": 9.161180929583084e-06, "loss": 1.7271, "mean_token_accuracy": 0.6523958176374436, "num_tokens": 3592763.0, "step": 8190 }, { "epoch": 2.4536205864751643, "grad_norm": 23.625, "learning_rate": 9.111310592459605e-06, "loss": 2.0387, "mean_token_accuracy": 0.595306721329689, "num_tokens": 3597038.0, "step": 8200 }, { "epoch": 2.4566128067025734, "grad_norm": 26.75, "learning_rate": 9.061440255336126e-06, "loss": 2.464, "mean_token_accuracy": 0.516951784491539, "num_tokens": 3602367.0, "step": 8210 }, { "epoch": 2.459605026929982, "grad_norm": 32.0, "learning_rate": 9.011569918212648e-06, "loss": 1.6953, "mean_token_accuracy": 0.6542330086231232, "num_tokens": 3606253.0, "step": 8220 }, { "epoch": 2.4625972471573907, "grad_norm": 17.75, "learning_rate": 8.961699581089169e-06, "loss": 1.2013, "mean_token_accuracy": 0.7238958954811097, "num_tokens": 3612181.0, "step": 8230 }, { "epoch": 2.4655894673847993, "grad_norm": 28.0, "learning_rate": 8.91182924396569e-06, "loss": 2.1341, "mean_token_accuracy": 0.5624263644218445, "num_tokens": 3616098.0, "step": 8240 }, { "epoch": 2.4685816876122084, "grad_norm": 43.0, "learning_rate": 8.86195890684221e-06, "loss": 2.1282, "mean_token_accuracy": 0.5523239612579346, "num_tokens": 3620981.0, "step": 8250 }, { "epoch": 2.471573907839617, "grad_norm": 29.25, "learning_rate": 8.812088569718731e-06, "loss": 2.1556, "mean_token_accuracy": 0.5922388851642608, "num_tokens": 3627070.0, "step": 8260 }, { "epoch": 2.4745661280670257, "grad_norm": 29.875, "learning_rate": 8.762218232595252e-06, "loss": 2.3435, "mean_token_accuracy": 0.5244899958372116, "num_tokens": 3631458.0, "step": 8270 }, { "epoch": 2.4775583482944343, "grad_norm": 41.75, "learning_rate": 8.712347895471775e-06, "loss": 2.0722, "mean_token_accuracy": 0.5771478354930878, "num_tokens": 3635862.0, "step": 8280 }, { "epoch": 2.480550568521843, "grad_norm": 29.75, "learning_rate": 8.662477558348295e-06, "loss": 1.9438, "mean_token_accuracy": 0.5872433006763458, "num_tokens": 3639631.0, "step": 8290 }, { "epoch": 2.483542788749252, "grad_norm": 27.125, "learning_rate": 8.612607221224816e-06, "loss": 1.4937, "mean_token_accuracy": 0.6888282895088196, "num_tokens": 3643726.0, "step": 8300 }, { "epoch": 2.4865350089766607, "grad_norm": 31.25, "learning_rate": 8.562736884101337e-06, "loss": 2.3963, "mean_token_accuracy": 0.5211231708526611, "num_tokens": 3647701.0, "step": 8310 }, { "epoch": 2.4895272292040693, "grad_norm": 74.5, "learning_rate": 8.512866546977858e-06, "loss": 1.8017, "mean_token_accuracy": 0.6249741375446319, "num_tokens": 3651854.0, "step": 8320 }, { "epoch": 2.492519449431478, "grad_norm": 20.375, "learning_rate": 8.462996209854379e-06, "loss": 2.1971, "mean_token_accuracy": 0.5514743745326995, "num_tokens": 3655610.0, "step": 8330 }, { "epoch": 2.495511669658887, "grad_norm": 60.25, "learning_rate": 8.4131258727309e-06, "loss": 2.1792, "mean_token_accuracy": 0.5621172040700912, "num_tokens": 3659872.0, "step": 8340 }, { "epoch": 2.4985038898862957, "grad_norm": 37.75, "learning_rate": 8.363255535607422e-06, "loss": 2.0568, "mean_token_accuracy": 0.5745551347732544, "num_tokens": 3664216.0, "step": 8350 }, { "epoch": 2.5014961101137043, "grad_norm": 33.5, "learning_rate": 8.313385198483941e-06, "loss": 1.8874, "mean_token_accuracy": 0.6061397522687912, "num_tokens": 3668880.0, "step": 8360 }, { "epoch": 2.504488330341113, "grad_norm": 32.75, "learning_rate": 8.263514861360464e-06, "loss": 1.9858, "mean_token_accuracy": 0.5864089488983154, "num_tokens": 3675294.0, "step": 8370 }, { "epoch": 2.5074805505685216, "grad_norm": 40.75, "learning_rate": 8.213644524236984e-06, "loss": 1.5634, "mean_token_accuracy": 0.6630536049604416, "num_tokens": 3680712.0, "step": 8380 }, { "epoch": 2.5104727707959307, "grad_norm": 28.5, "learning_rate": 8.163774187113505e-06, "loss": 1.8208, "mean_token_accuracy": 0.6154579162597656, "num_tokens": 3685294.0, "step": 8390 }, { "epoch": 2.5134649910233393, "grad_norm": 49.0, "learning_rate": 8.113903849990026e-06, "loss": 2.1475, "mean_token_accuracy": 0.562446939945221, "num_tokens": 3688792.0, "step": 8400 }, { "epoch": 2.516457211250748, "grad_norm": 31.625, "learning_rate": 8.064033512866548e-06, "loss": 2.1782, "mean_token_accuracy": 0.5494516760110855, "num_tokens": 3693374.0, "step": 8410 }, { "epoch": 2.519449431478157, "grad_norm": 42.75, "learning_rate": 8.014163175743067e-06, "loss": 1.516, "mean_token_accuracy": 0.6680077582597732, "num_tokens": 3697590.0, "step": 8420 }, { "epoch": 2.5224416517055657, "grad_norm": 36.25, "learning_rate": 7.96429283861959e-06, "loss": 2.8254, "mean_token_accuracy": 0.45352172702550886, "num_tokens": 3702193.0, "step": 8430 }, { "epoch": 2.5254338719329743, "grad_norm": 50.75, "learning_rate": 7.91442250149611e-06, "loss": 1.8653, "mean_token_accuracy": 0.6249462842941285, "num_tokens": 3706496.0, "step": 8440 }, { "epoch": 2.528426092160383, "grad_norm": 11.0, "learning_rate": 7.864552164372632e-06, "loss": 1.8116, "mean_token_accuracy": 0.6244892299175262, "num_tokens": 3710279.0, "step": 8450 }, { "epoch": 2.5314183123877916, "grad_norm": 38.75, "learning_rate": 7.814681827249152e-06, "loss": 1.8314, "mean_token_accuracy": 0.631192871928215, "num_tokens": 3713786.0, "step": 8460 }, { "epoch": 2.5344105326152, "grad_norm": 22.0, "learning_rate": 7.764811490125673e-06, "loss": 1.9399, "mean_token_accuracy": 0.5822267323732376, "num_tokens": 3718329.0, "step": 8470 }, { "epoch": 2.5374027528426093, "grad_norm": 22.0, "learning_rate": 7.714941153002194e-06, "loss": 2.0836, "mean_token_accuracy": 0.5822148770093918, "num_tokens": 3722215.0, "step": 8480 }, { "epoch": 2.540394973070018, "grad_norm": 22.875, "learning_rate": 7.665070815878715e-06, "loss": 1.8375, "mean_token_accuracy": 0.6172699481248856, "num_tokens": 3725939.0, "step": 8490 }, { "epoch": 2.5433871932974266, "grad_norm": 24.75, "learning_rate": 7.615200478755237e-06, "loss": 1.6583, "mean_token_accuracy": 0.6369618535041809, "num_tokens": 3730746.0, "step": 8500 }, { "epoch": 2.5463794135248357, "grad_norm": 40.25, "learning_rate": 7.565330141631757e-06, "loss": 2.1574, "mean_token_accuracy": 0.5527519911527634, "num_tokens": 3734695.0, "step": 8510 }, { "epoch": 2.5493716337522443, "grad_norm": 28.875, "learning_rate": 7.515459804508279e-06, "loss": 1.6126, "mean_token_accuracy": 0.650888466835022, "num_tokens": 3738514.0, "step": 8520 }, { "epoch": 2.552363853979653, "grad_norm": 21.125, "learning_rate": 7.4655894673848e-06, "loss": 2.3758, "mean_token_accuracy": 0.5187936812639237, "num_tokens": 3743566.0, "step": 8530 }, { "epoch": 2.5553560742070616, "grad_norm": 21.625, "learning_rate": 7.415719130261321e-06, "loss": 2.1145, "mean_token_accuracy": 0.5670811682939529, "num_tokens": 3747765.0, "step": 8540 }, { "epoch": 2.55834829443447, "grad_norm": 20.625, "learning_rate": 7.365848793137842e-06, "loss": 1.8283, "mean_token_accuracy": 0.6210377246141434, "num_tokens": 3752107.0, "step": 8550 }, { "epoch": 2.561340514661879, "grad_norm": 29.125, "learning_rate": 7.315978456014364e-06, "loss": 1.8758, "mean_token_accuracy": 0.6091048032045364, "num_tokens": 3756006.0, "step": 8560 }, { "epoch": 2.564332734889288, "grad_norm": 24.0, "learning_rate": 7.266108118890884e-06, "loss": 2.0391, "mean_token_accuracy": 0.576360335946083, "num_tokens": 3759916.0, "step": 8570 }, { "epoch": 2.5673249551166966, "grad_norm": 64.5, "learning_rate": 7.216237781767405e-06, "loss": 2.1935, "mean_token_accuracy": 0.5540457516908646, "num_tokens": 3763119.0, "step": 8580 }, { "epoch": 2.570317175344105, "grad_norm": 29.0, "learning_rate": 7.166367444643926e-06, "loss": 1.4691, "mean_token_accuracy": 0.6710379928350448, "num_tokens": 3768397.0, "step": 8590 }, { "epoch": 2.5733093955715143, "grad_norm": 19.375, "learning_rate": 7.116497107520448e-06, "loss": 1.8466, "mean_token_accuracy": 0.6215602785348893, "num_tokens": 3772852.0, "step": 8600 }, { "epoch": 2.576301615798923, "grad_norm": 29.125, "learning_rate": 7.0666267703969685e-06, "loss": 1.796, "mean_token_accuracy": 0.6303971827030181, "num_tokens": 3777531.0, "step": 8610 }, { "epoch": 2.5792938360263316, "grad_norm": 39.25, "learning_rate": 7.0167564332734884e-06, "loss": 1.7612, "mean_token_accuracy": 0.6404202848672866, "num_tokens": 3781420.0, "step": 8620 }, { "epoch": 2.58228605625374, "grad_norm": 23.125, "learning_rate": 6.96688609615001e-06, "loss": 2.5314, "mean_token_accuracy": 0.49140324890613557, "num_tokens": 3783768.0, "step": 8630 }, { "epoch": 2.585278276481149, "grad_norm": 29.5, "learning_rate": 6.917015759026531e-06, "loss": 2.1956, "mean_token_accuracy": 0.5796375393867492, "num_tokens": 3786827.0, "step": 8640 }, { "epoch": 2.588270496708558, "grad_norm": 35.0, "learning_rate": 6.8671454219030525e-06, "loss": 2.2291, "mean_token_accuracy": 0.5234993904829025, "num_tokens": 3790613.0, "step": 8650 }, { "epoch": 2.5912627169359665, "grad_norm": 25.0, "learning_rate": 6.817275084779573e-06, "loss": 2.9253, "mean_token_accuracy": 0.44142225980758665, "num_tokens": 3796033.0, "step": 8660 }, { "epoch": 2.594254937163375, "grad_norm": 33.25, "learning_rate": 6.767404747656095e-06, "loss": 2.8558, "mean_token_accuracy": 0.46484392285346987, "num_tokens": 3799913.0, "step": 8670 }, { "epoch": 2.597247157390784, "grad_norm": 25.0, "learning_rate": 6.717534410532615e-06, "loss": 2.0318, "mean_token_accuracy": 0.5812970370054245, "num_tokens": 3804951.0, "step": 8680 }, { "epoch": 2.600239377618193, "grad_norm": 37.75, "learning_rate": 6.6676640734091365e-06, "loss": 1.9003, "mean_token_accuracy": 0.6071935892105103, "num_tokens": 3808152.0, "step": 8690 }, { "epoch": 2.6032315978456015, "grad_norm": 24.75, "learning_rate": 6.617793736285657e-06, "loss": 1.9307, "mean_token_accuracy": 0.6070375055074692, "num_tokens": 3811382.0, "step": 8700 }, { "epoch": 2.60622381807301, "grad_norm": 25.625, "learning_rate": 6.567923399162179e-06, "loss": 1.6272, "mean_token_accuracy": 0.6634323447942734, "num_tokens": 3816044.0, "step": 8710 }, { "epoch": 2.609216038300419, "grad_norm": 29.625, "learning_rate": 6.5180530620387e-06, "loss": 2.0368, "mean_token_accuracy": 0.5823567092418671, "num_tokens": 3820355.0, "step": 8720 }, { "epoch": 2.6122082585278275, "grad_norm": 28.375, "learning_rate": 6.468182724915221e-06, "loss": 2.2438, "mean_token_accuracy": 0.5450930893421173, "num_tokens": 3824214.0, "step": 8730 }, { "epoch": 2.6152004787552365, "grad_norm": 36.75, "learning_rate": 6.418312387791741e-06, "loss": 1.9044, "mean_token_accuracy": 0.6274087101221084, "num_tokens": 3829582.0, "step": 8740 }, { "epoch": 2.618192698982645, "grad_norm": 34.25, "learning_rate": 6.368442050668264e-06, "loss": 2.3823, "mean_token_accuracy": 0.522585716843605, "num_tokens": 3834310.0, "step": 8750 }, { "epoch": 2.621184919210054, "grad_norm": 14.0625, "learning_rate": 6.318571713544784e-06, "loss": 2.1984, "mean_token_accuracy": 0.5534607917070389, "num_tokens": 3841092.0, "step": 8760 }, { "epoch": 2.6241771394374624, "grad_norm": 36.0, "learning_rate": 6.268701376421305e-06, "loss": 1.5606, "mean_token_accuracy": 0.6726154178380966, "num_tokens": 3846284.0, "step": 8770 }, { "epoch": 2.6271693596648715, "grad_norm": 31.625, "learning_rate": 6.218831039297826e-06, "loss": 1.9837, "mean_token_accuracy": 0.5853519588708878, "num_tokens": 3849669.0, "step": 8780 }, { "epoch": 2.63016157989228, "grad_norm": 33.5, "learning_rate": 6.168960702174347e-06, "loss": 1.4509, "mean_token_accuracy": 0.6898014098405838, "num_tokens": 3852892.0, "step": 8790 }, { "epoch": 2.633153800119689, "grad_norm": 33.5, "learning_rate": 6.119090365050869e-06, "loss": 1.6132, "mean_token_accuracy": 0.6505110949277878, "num_tokens": 3856161.0, "step": 8800 }, { "epoch": 2.6361460203470974, "grad_norm": 16.25, "learning_rate": 6.0692200279273894e-06, "loss": 2.2723, "mean_token_accuracy": 0.5464581429958344, "num_tokens": 3860799.0, "step": 8810 }, { "epoch": 2.639138240574506, "grad_norm": 12.125, "learning_rate": 6.01934969080391e-06, "loss": 2.0183, "mean_token_accuracy": 0.574837502837181, "num_tokens": 3864947.0, "step": 8820 }, { "epoch": 2.642130460801915, "grad_norm": 22.875, "learning_rate": 5.969479353680431e-06, "loss": 1.6786, "mean_token_accuracy": 0.6457695811986923, "num_tokens": 3869804.0, "step": 8830 }, { "epoch": 2.645122681029324, "grad_norm": 27.75, "learning_rate": 5.919609016556952e-06, "loss": 2.1758, "mean_token_accuracy": 0.5874395340681076, "num_tokens": 3874553.0, "step": 8840 }, { "epoch": 2.6481149012567324, "grad_norm": 32.5, "learning_rate": 5.869738679433473e-06, "loss": 1.8416, "mean_token_accuracy": 0.6180979579687118, "num_tokens": 3878177.0, "step": 8850 }, { "epoch": 2.651107121484141, "grad_norm": 25.875, "learning_rate": 5.819868342309994e-06, "loss": 1.9093, "mean_token_accuracy": 0.6036896035075188, "num_tokens": 3881704.0, "step": 8860 }, { "epoch": 2.65409934171155, "grad_norm": 48.75, "learning_rate": 5.769998005186515e-06, "loss": 1.7578, "mean_token_accuracy": 0.6388221472501755, "num_tokens": 3886023.0, "step": 8870 }, { "epoch": 2.657091561938959, "grad_norm": 51.25, "learning_rate": 5.720127668063036e-06, "loss": 2.7133, "mean_token_accuracy": 0.47807915806770324, "num_tokens": 3890025.0, "step": 8880 }, { "epoch": 2.6600837821663674, "grad_norm": 25.875, "learning_rate": 5.6702573309395575e-06, "loss": 1.9606, "mean_token_accuracy": 0.5969976544380188, "num_tokens": 3893625.0, "step": 8890 }, { "epoch": 2.663076002393776, "grad_norm": 14.5, "learning_rate": 5.620386993816078e-06, "loss": 2.481, "mean_token_accuracy": 0.5044433563947678, "num_tokens": 3899249.0, "step": 8900 }, { "epoch": 2.6660682226211847, "grad_norm": 38.0, "learning_rate": 5.5705166566926e-06, "loss": 1.7464, "mean_token_accuracy": 0.6185297697782517, "num_tokens": 3903839.0, "step": 8910 }, { "epoch": 2.669060442848594, "grad_norm": 36.75, "learning_rate": 5.520646319569121e-06, "loss": 2.305, "mean_token_accuracy": 0.5210951715707779, "num_tokens": 3906730.0, "step": 8920 }, { "epoch": 2.6720526630760024, "grad_norm": 34.0, "learning_rate": 5.4707759824456415e-06, "loss": 1.1306, "mean_token_accuracy": 0.7620425939559936, "num_tokens": 3910169.0, "step": 8930 }, { "epoch": 2.675044883303411, "grad_norm": 28.0, "learning_rate": 5.420905645322163e-06, "loss": 1.8178, "mean_token_accuracy": 0.6252069532871246, "num_tokens": 3914875.0, "step": 8940 }, { "epoch": 2.67803710353082, "grad_norm": 13.625, "learning_rate": 5.371035308198684e-06, "loss": 1.6462, "mean_token_accuracy": 0.6465720534324646, "num_tokens": 3918847.0, "step": 8950 }, { "epoch": 2.6810293237582288, "grad_norm": 44.0, "learning_rate": 5.321164971075205e-06, "loss": 1.9698, "mean_token_accuracy": 0.6158938229084014, "num_tokens": 3921966.0, "step": 8960 }, { "epoch": 2.6840215439856374, "grad_norm": 30.875, "learning_rate": 5.271294633951726e-06, "loss": 2.3922, "mean_token_accuracy": 0.5279428198933601, "num_tokens": 3926081.0, "step": 8970 }, { "epoch": 2.687013764213046, "grad_norm": 30.75, "learning_rate": 5.221424296828246e-06, "loss": 2.0273, "mean_token_accuracy": 0.5862969070672989, "num_tokens": 3931317.0, "step": 8980 }, { "epoch": 2.6900059844404547, "grad_norm": 31.875, "learning_rate": 5.171553959704768e-06, "loss": 2.6512, "mean_token_accuracy": 0.4965745806694031, "num_tokens": 3935938.0, "step": 8990 }, { "epoch": 2.6929982046678633, "grad_norm": 29.875, "learning_rate": 5.121683622581289e-06, "loss": 2.0997, "mean_token_accuracy": 0.5695976376533508, "num_tokens": 3941232.0, "step": 9000 }, { "epoch": 2.6959904248952724, "grad_norm": 22.25, "learning_rate": 5.0718132854578096e-06, "loss": 1.9799, "mean_token_accuracy": 0.5941878288984299, "num_tokens": 3944101.0, "step": 9010 }, { "epoch": 2.698982645122681, "grad_norm": 22.375, "learning_rate": 5.021942948334331e-06, "loss": 1.1311, "mean_token_accuracy": 0.7389927178621292, "num_tokens": 3947397.0, "step": 9020 }, { "epoch": 2.7019748653500897, "grad_norm": 27.0, "learning_rate": 4.972072611210852e-06, "loss": 1.8437, "mean_token_accuracy": 0.620136833190918, "num_tokens": 3953487.0, "step": 9030 }, { "epoch": 2.7049670855774988, "grad_norm": 53.0, "learning_rate": 4.922202274087373e-06, "loss": 2.3224, "mean_token_accuracy": 0.5359113857150077, "num_tokens": 3957159.0, "step": 9040 }, { "epoch": 2.7079593058049074, "grad_norm": 29.0, "learning_rate": 4.872331936963894e-06, "loss": 1.6784, "mean_token_accuracy": 0.6427404046058655, "num_tokens": 3961444.0, "step": 9050 }, { "epoch": 2.710951526032316, "grad_norm": 47.25, "learning_rate": 4.822461599840415e-06, "loss": 2.3031, "mean_token_accuracy": 0.5322034955024719, "num_tokens": 3966270.0, "step": 9060 }, { "epoch": 2.7139437462597247, "grad_norm": 35.0, "learning_rate": 4.772591262716936e-06, "loss": 1.7478, "mean_token_accuracy": 0.6326006144285202, "num_tokens": 3970793.0, "step": 9070 }, { "epoch": 2.7169359664871333, "grad_norm": 21.25, "learning_rate": 4.722720925593458e-06, "loss": 1.8017, "mean_token_accuracy": 0.6345465123653412, "num_tokens": 3974977.0, "step": 9080 }, { "epoch": 2.719928186714542, "grad_norm": 34.25, "learning_rate": 4.6728505884699784e-06, "loss": 1.8966, "mean_token_accuracy": 0.6168614506721497, "num_tokens": 3980230.0, "step": 9090 }, { "epoch": 2.722920406941951, "grad_norm": 30.0, "learning_rate": 4.622980251346499e-06, "loss": 1.9168, "mean_token_accuracy": 0.5983828186988831, "num_tokens": 3985529.0, "step": 9100 }, { "epoch": 2.7259126271693597, "grad_norm": 41.5, "learning_rate": 4.573109914223021e-06, "loss": 1.7793, "mean_token_accuracy": 0.6030765622854233, "num_tokens": 3989337.0, "step": 9110 }, { "epoch": 2.7289048473967683, "grad_norm": 46.0, "learning_rate": 4.523239577099542e-06, "loss": 2.2499, "mean_token_accuracy": 0.5526516705751419, "num_tokens": 3995079.0, "step": 9120 }, { "epoch": 2.7318970676241774, "grad_norm": 27.0, "learning_rate": 4.4733692399760625e-06, "loss": 1.8144, "mean_token_accuracy": 0.6078069090843201, "num_tokens": 3999875.0, "step": 9130 }, { "epoch": 2.734889287851586, "grad_norm": 35.0, "learning_rate": 4.423498902852583e-06, "loss": 1.819, "mean_token_accuracy": 0.6298864781856537, "num_tokens": 4004317.0, "step": 9140 }, { "epoch": 2.7378815080789947, "grad_norm": 29.25, "learning_rate": 4.373628565729104e-06, "loss": 2.062, "mean_token_accuracy": 0.5830421209335327, "num_tokens": 4009760.0, "step": 9150 }, { "epoch": 2.7408737283064033, "grad_norm": 27.375, "learning_rate": 4.323758228605626e-06, "loss": 2.1911, "mean_token_accuracy": 0.5540030539035797, "num_tokens": 4014489.0, "step": 9160 }, { "epoch": 2.743865948533812, "grad_norm": 23.25, "learning_rate": 4.2738878914821465e-06, "loss": 1.8985, "mean_token_accuracy": 0.5950787127017975, "num_tokens": 4018655.0, "step": 9170 }, { "epoch": 2.7468581687612206, "grad_norm": 25.125, "learning_rate": 4.224017554358667e-06, "loss": 1.5748, "mean_token_accuracy": 0.6524001300334931, "num_tokens": 4023175.0, "step": 9180 }, { "epoch": 2.7498503889886297, "grad_norm": 18.875, "learning_rate": 4.174147217235189e-06, "loss": 1.598, "mean_token_accuracy": 0.6746624201536179, "num_tokens": 4028084.0, "step": 9190 }, { "epoch": 2.7528426092160383, "grad_norm": 41.0, "learning_rate": 4.12427688011171e-06, "loss": 1.7713, "mean_token_accuracy": 0.6220902055501938, "num_tokens": 4032253.0, "step": 9200 }, { "epoch": 2.755834829443447, "grad_norm": 29.0, "learning_rate": 4.0744065429882305e-06, "loss": 1.8611, "mean_token_accuracy": 0.6254755735397339, "num_tokens": 4035717.0, "step": 9210 }, { "epoch": 2.758827049670856, "grad_norm": 34.75, "learning_rate": 4.024536205864752e-06, "loss": 1.786, "mean_token_accuracy": 0.6216672778129577, "num_tokens": 4039463.0, "step": 9220 }, { "epoch": 2.7618192698982647, "grad_norm": 17.125, "learning_rate": 3.974665868741273e-06, "loss": 1.4427, "mean_token_accuracy": 0.6828666806221009, "num_tokens": 4044969.0, "step": 9230 }, { "epoch": 2.7648114901256733, "grad_norm": 39.0, "learning_rate": 3.924795531617794e-06, "loss": 2.1013, "mean_token_accuracy": 0.5784855902194976, "num_tokens": 4049817.0, "step": 9240 }, { "epoch": 2.767803710353082, "grad_norm": 29.25, "learning_rate": 3.874925194494315e-06, "loss": 2.1812, "mean_token_accuracy": 0.5448796033859253, "num_tokens": 4054405.0, "step": 9250 }, { "epoch": 2.7707959305804906, "grad_norm": 30.25, "learning_rate": 3.825054857370836e-06, "loss": 2.0633, "mean_token_accuracy": 0.5805003941059113, "num_tokens": 4059175.0, "step": 9260 }, { "epoch": 2.773788150807899, "grad_norm": 34.5, "learning_rate": 3.7751845202473574e-06, "loss": 1.5614, "mean_token_accuracy": 0.6646808028221131, "num_tokens": 4062650.0, "step": 9270 }, { "epoch": 2.7767803710353083, "grad_norm": 40.75, "learning_rate": 3.7253141831238778e-06, "loss": 2.1483, "mean_token_accuracy": 0.5612222149968147, "num_tokens": 4066947.0, "step": 9280 }, { "epoch": 2.779772591262717, "grad_norm": 17.375, "learning_rate": 3.675443846000399e-06, "loss": 2.0027, "mean_token_accuracy": 0.5933429479599, "num_tokens": 4071052.0, "step": 9290 }, { "epoch": 2.7827648114901256, "grad_norm": 31.0, "learning_rate": 3.6255735088769198e-06, "loss": 1.4943, "mean_token_accuracy": 0.6853374063968658, "num_tokens": 4075442.0, "step": 9300 }, { "epoch": 2.7857570317175346, "grad_norm": 12.375, "learning_rate": 3.575703171753441e-06, "loss": 1.8588, "mean_token_accuracy": 0.600016513466835, "num_tokens": 4080536.0, "step": 9310 }, { "epoch": 2.7887492519449433, "grad_norm": 34.25, "learning_rate": 3.525832834629962e-06, "loss": 1.9483, "mean_token_accuracy": 0.5879528015851975, "num_tokens": 4084930.0, "step": 9320 }, { "epoch": 2.791741472172352, "grad_norm": 29.625, "learning_rate": 3.475962497506483e-06, "loss": 1.6104, "mean_token_accuracy": 0.660100382566452, "num_tokens": 4088724.0, "step": 9330 }, { "epoch": 2.7947336923997605, "grad_norm": 39.0, "learning_rate": 3.426092160383004e-06, "loss": 2.4462, "mean_token_accuracy": 0.5218830987811088, "num_tokens": 4092698.0, "step": 9340 }, { "epoch": 2.797725912627169, "grad_norm": 38.75, "learning_rate": 3.3762218232595254e-06, "loss": 1.7577, "mean_token_accuracy": 0.6282264947891235, "num_tokens": 4097093.0, "step": 9350 }, { "epoch": 2.800718132854578, "grad_norm": 82.5, "learning_rate": 3.3263514861360466e-06, "loss": 2.0219, "mean_token_accuracy": 0.5767106354236603, "num_tokens": 4101442.0, "step": 9360 }, { "epoch": 2.803710353081987, "grad_norm": 30.0, "learning_rate": 3.2764811490125674e-06, "loss": 1.3033, "mean_token_accuracy": 0.7084575563669204, "num_tokens": 4104841.0, "step": 9370 }, { "epoch": 2.8067025733093955, "grad_norm": 32.5, "learning_rate": 3.2266108118890887e-06, "loss": 1.5835, "mean_token_accuracy": 0.6537139505147934, "num_tokens": 4108121.0, "step": 9380 }, { "epoch": 2.809694793536804, "grad_norm": 25.875, "learning_rate": 3.17674047476561e-06, "loss": 1.6108, "mean_token_accuracy": 0.6637275129556656, "num_tokens": 4112032.0, "step": 9390 }, { "epoch": 2.8126870137642133, "grad_norm": 32.75, "learning_rate": 3.1268701376421307e-06, "loss": 1.8535, "mean_token_accuracy": 0.6086684614419937, "num_tokens": 4115984.0, "step": 9400 }, { "epoch": 2.815679233991622, "grad_norm": 35.25, "learning_rate": 3.0769998005186515e-06, "loss": 1.8364, "mean_token_accuracy": 0.6273092567920685, "num_tokens": 4120579.0, "step": 9410 }, { "epoch": 2.8186714542190305, "grad_norm": 33.75, "learning_rate": 3.0271294633951727e-06, "loss": 1.7187, "mean_token_accuracy": 0.6125760763883591, "num_tokens": 4123676.0, "step": 9420 }, { "epoch": 2.821663674446439, "grad_norm": 32.25, "learning_rate": 2.977259126271694e-06, "loss": 1.3378, "mean_token_accuracy": 0.7154994040727616, "num_tokens": 4128761.0, "step": 9430 }, { "epoch": 2.824655894673848, "grad_norm": 33.0, "learning_rate": 2.9273887891482147e-06, "loss": 1.4772, "mean_token_accuracy": 0.6775191038846969, "num_tokens": 4132492.0, "step": 9440 }, { "epoch": 2.827648114901257, "grad_norm": 41.5, "learning_rate": 2.877518452024736e-06, "loss": 2.1733, "mean_token_accuracy": 0.5554867714643479, "num_tokens": 4137709.0, "step": 9450 }, { "epoch": 2.8306403351286655, "grad_norm": 22.125, "learning_rate": 2.827648114901257e-06, "loss": 1.8694, "mean_token_accuracy": 0.5886508136987686, "num_tokens": 4141086.0, "step": 9460 }, { "epoch": 2.833632555356074, "grad_norm": 31.625, "learning_rate": 2.777777777777778e-06, "loss": 1.431, "mean_token_accuracy": 0.6860785514116288, "num_tokens": 4146674.0, "step": 9470 }, { "epoch": 2.836624775583483, "grad_norm": 27.125, "learning_rate": 2.7279074406542987e-06, "loss": 2.2025, "mean_token_accuracy": 0.554258507490158, "num_tokens": 4150892.0, "step": 9480 }, { "epoch": 2.839616995810892, "grad_norm": 45.0, "learning_rate": 2.67803710353082e-06, "loss": 2.46, "mean_token_accuracy": 0.500794404745102, "num_tokens": 4156396.0, "step": 9490 }, { "epoch": 2.8426092160383005, "grad_norm": 17.25, "learning_rate": 2.628166766407341e-06, "loss": 2.0167, "mean_token_accuracy": 0.5807638555765152, "num_tokens": 4160340.0, "step": 9500 }, { "epoch": 2.845601436265709, "grad_norm": 37.75, "learning_rate": 2.578296429283862e-06, "loss": 2.3393, "mean_token_accuracy": 0.5255331352353096, "num_tokens": 4164764.0, "step": 9510 }, { "epoch": 2.848593656493118, "grad_norm": 44.25, "learning_rate": 2.528426092160383e-06, "loss": 1.9644, "mean_token_accuracy": 0.6037229418754577, "num_tokens": 4168859.0, "step": 9520 }, { "epoch": 2.8515858767205264, "grad_norm": 26.5, "learning_rate": 2.4785557550369044e-06, "loss": 1.3245, "mean_token_accuracy": 0.7033275246620179, "num_tokens": 4174167.0, "step": 9530 }, { "epoch": 2.8545780969479355, "grad_norm": 14.4375, "learning_rate": 2.4286854179134256e-06, "loss": 1.2336, "mean_token_accuracy": 0.732453578710556, "num_tokens": 4178884.0, "step": 9540 }, { "epoch": 2.857570317175344, "grad_norm": 18.625, "learning_rate": 2.378815080789946e-06, "loss": 1.5867, "mean_token_accuracy": 0.6451264679431915, "num_tokens": 4182310.0, "step": 9550 }, { "epoch": 2.860562537402753, "grad_norm": 14.0625, "learning_rate": 2.328944743666467e-06, "loss": 1.7747, "mean_token_accuracy": 0.6581346958875656, "num_tokens": 4187444.0, "step": 9560 }, { "epoch": 2.8635547576301614, "grad_norm": 57.0, "learning_rate": 2.2790744065429884e-06, "loss": 1.9725, "mean_token_accuracy": 0.6179261445999146, "num_tokens": 4192217.0, "step": 9570 }, { "epoch": 2.8665469778575705, "grad_norm": 50.75, "learning_rate": 2.229204069419509e-06, "loss": 2.4587, "mean_token_accuracy": 0.4987264096736908, "num_tokens": 4195921.0, "step": 9580 }, { "epoch": 2.869539198084979, "grad_norm": 25.0, "learning_rate": 2.1793337322960304e-06, "loss": 1.3648, "mean_token_accuracy": 0.7000836312770844, "num_tokens": 4200450.0, "step": 9590 }, { "epoch": 2.872531418312388, "grad_norm": 61.75, "learning_rate": 2.1294633951725516e-06, "loss": 1.72, "mean_token_accuracy": 0.6363197565078735, "num_tokens": 4204745.0, "step": 9600 }, { "epoch": 2.8755236385397964, "grad_norm": 39.5, "learning_rate": 2.079593058049073e-06, "loss": 1.6018, "mean_token_accuracy": 0.6612495809793473, "num_tokens": 4208026.0, "step": 9610 }, { "epoch": 2.878515858767205, "grad_norm": 52.75, "learning_rate": 2.029722720925593e-06, "loss": 1.902, "mean_token_accuracy": 0.6017478436231614, "num_tokens": 4212714.0, "step": 9620 }, { "epoch": 2.881508078994614, "grad_norm": 23.25, "learning_rate": 1.9798523838021144e-06, "loss": 2.171, "mean_token_accuracy": 0.5636053562164307, "num_tokens": 4217985.0, "step": 9630 }, { "epoch": 2.884500299222023, "grad_norm": 21.25, "learning_rate": 1.9299820466786356e-06, "loss": 2.16, "mean_token_accuracy": 0.5612272381782532, "num_tokens": 4222542.0, "step": 9640 }, { "epoch": 2.8874925194494314, "grad_norm": 8.875, "learning_rate": 1.8801117095551566e-06, "loss": 2.0578, "mean_token_accuracy": 0.5891127914190293, "num_tokens": 4227617.0, "step": 9650 }, { "epoch": 2.89048473967684, "grad_norm": 30.375, "learning_rate": 1.8302413724316779e-06, "loss": 2.158, "mean_token_accuracy": 0.5701712220907211, "num_tokens": 4232468.0, "step": 9660 }, { "epoch": 2.893476959904249, "grad_norm": 50.0, "learning_rate": 1.7803710353081989e-06, "loss": 1.9668, "mean_token_accuracy": 0.5937556356191636, "num_tokens": 4236640.0, "step": 9670 }, { "epoch": 2.8964691801316578, "grad_norm": 35.5, "learning_rate": 1.7305006981847199e-06, "loss": 2.0277, "mean_token_accuracy": 0.5735794514417648, "num_tokens": 4242045.0, "step": 9680 }, { "epoch": 2.8994614003590664, "grad_norm": 50.0, "learning_rate": 1.680630361061241e-06, "loss": 2.0398, "mean_token_accuracy": 0.5936207205057145, "num_tokens": 4245908.0, "step": 9690 }, { "epoch": 2.902453620586475, "grad_norm": 51.75, "learning_rate": 1.6307600239377617e-06, "loss": 2.1149, "mean_token_accuracy": 0.5694210886955261, "num_tokens": 4250424.0, "step": 9700 }, { "epoch": 2.9054458408138837, "grad_norm": 32.5, "learning_rate": 1.5808896868142829e-06, "loss": 2.1492, "mean_token_accuracy": 0.5405183464288712, "num_tokens": 4254490.0, "step": 9710 }, { "epoch": 2.9084380610412928, "grad_norm": 17.75, "learning_rate": 1.5310193496908039e-06, "loss": 1.8224, "mean_token_accuracy": 0.6002204239368438, "num_tokens": 4258253.0, "step": 9720 }, { "epoch": 2.9114302812687014, "grad_norm": 47.5, "learning_rate": 1.4811490125673251e-06, "loss": 1.8376, "mean_token_accuracy": 0.610488024353981, "num_tokens": 4261515.0, "step": 9730 }, { "epoch": 2.91442250149611, "grad_norm": 27.25, "learning_rate": 1.4312786754438461e-06, "loss": 2.3097, "mean_token_accuracy": 0.5365430325269699, "num_tokens": 4268730.0, "step": 9740 }, { "epoch": 2.917414721723519, "grad_norm": 18.5, "learning_rate": 1.3814083383203671e-06, "loss": 1.8928, "mean_token_accuracy": 0.5975118815898895, "num_tokens": 4272857.0, "step": 9750 }, { "epoch": 2.9204069419509278, "grad_norm": 42.75, "learning_rate": 1.3315380011968881e-06, "loss": 2.1786, "mean_token_accuracy": 0.5498067557811737, "num_tokens": 4277174.0, "step": 9760 }, { "epoch": 2.9233991621783364, "grad_norm": 23.125, "learning_rate": 1.2816676640734091e-06, "loss": 1.3384, "mean_token_accuracy": 0.6992694318294526, "num_tokens": 4281157.0, "step": 9770 }, { "epoch": 2.926391382405745, "grad_norm": 20.75, "learning_rate": 1.2317973269499303e-06, "loss": 2.176, "mean_token_accuracy": 0.5774380654096604, "num_tokens": 4286810.0, "step": 9780 }, { "epoch": 2.9293836026331537, "grad_norm": 17.0, "learning_rate": 1.1819269898264511e-06, "loss": 1.7546, "mean_token_accuracy": 0.6253076940774918, "num_tokens": 4291078.0, "step": 9790 }, { "epoch": 2.9323758228605623, "grad_norm": 32.75, "learning_rate": 1.1320566527029724e-06, "loss": 1.8533, "mean_token_accuracy": 0.6021129012107849, "num_tokens": 4294902.0, "step": 9800 }, { "epoch": 2.9353680430879714, "grad_norm": 55.75, "learning_rate": 1.0821863155794934e-06, "loss": 1.7355, "mean_token_accuracy": 0.6272025167942047, "num_tokens": 4299523.0, "step": 9810 }, { "epoch": 2.93836026331538, "grad_norm": 55.75, "learning_rate": 1.0323159784560146e-06, "loss": 1.8426, "mean_token_accuracy": 0.6203331828117371, "num_tokens": 4305206.0, "step": 9820 }, { "epoch": 2.9413524835427887, "grad_norm": 33.5, "learning_rate": 9.824456413325354e-07, "loss": 2.3632, "mean_token_accuracy": 0.5340803563594818, "num_tokens": 4308887.0, "step": 9830 }, { "epoch": 2.9443447037701977, "grad_norm": 32.25, "learning_rate": 9.325753042090565e-07, "loss": 1.1291, "mean_token_accuracy": 0.7302363455295563, "num_tokens": 4312388.0, "step": 9840 }, { "epoch": 2.9473369239976064, "grad_norm": 41.0, "learning_rate": 8.827049670855776e-07, "loss": 1.5903, "mean_token_accuracy": 0.6539588749408722, "num_tokens": 4317350.0, "step": 9850 }, { "epoch": 2.950329144225015, "grad_norm": 24.25, "learning_rate": 8.328346299620985e-07, "loss": 1.9191, "mean_token_accuracy": 0.6088144451379776, "num_tokens": 4321927.0, "step": 9860 }, { "epoch": 2.9533213644524237, "grad_norm": 63.5, "learning_rate": 7.829642928386196e-07, "loss": 2.393, "mean_token_accuracy": 0.5242181986570358, "num_tokens": 4326242.0, "step": 9870 }, { "epoch": 2.9563135846798323, "grad_norm": 30.625, "learning_rate": 7.330939557151407e-07, "loss": 1.9247, "mean_token_accuracy": 0.5961373239755631, "num_tokens": 4330126.0, "step": 9880 }, { "epoch": 2.959305804907241, "grad_norm": 69.0, "learning_rate": 6.832236185916617e-07, "loss": 1.4818, "mean_token_accuracy": 0.6732258468866348, "num_tokens": 4333877.0, "step": 9890 }, { "epoch": 2.96229802513465, "grad_norm": 32.25, "learning_rate": 6.333532814681827e-07, "loss": 1.7031, "mean_token_accuracy": 0.6317979246377945, "num_tokens": 4338761.0, "step": 9900 }, { "epoch": 2.9652902453620587, "grad_norm": 41.5, "learning_rate": 5.834829443447037e-07, "loss": 2.1448, "mean_token_accuracy": 0.5809542566537857, "num_tokens": 4343273.0, "step": 9910 }, { "epoch": 2.9682824655894673, "grad_norm": 29.125, "learning_rate": 5.336126072212248e-07, "loss": 1.8744, "mean_token_accuracy": 0.6038303703069687, "num_tokens": 4348215.0, "step": 9920 }, { "epoch": 2.9712746858168764, "grad_norm": 21.25, "learning_rate": 4.837422700977458e-07, "loss": 1.7846, "mean_token_accuracy": 0.6139202952384949, "num_tokens": 4351873.0, "step": 9930 }, { "epoch": 2.974266906044285, "grad_norm": 26.375, "learning_rate": 4.3387193297426696e-07, "loss": 1.8021, "mean_token_accuracy": 0.6420397758483887, "num_tokens": 4355736.0, "step": 9940 }, { "epoch": 2.9772591262716936, "grad_norm": 29.375, "learning_rate": 3.8400159585078796e-07, "loss": 1.7598, "mean_token_accuracy": 0.6427461594343186, "num_tokens": 4360600.0, "step": 9950 }, { "epoch": 2.9802513464991023, "grad_norm": 19.375, "learning_rate": 3.34131258727309e-07, "loss": 1.9815, "mean_token_accuracy": 0.6036760896444321, "num_tokens": 4366937.0, "step": 9960 }, { "epoch": 2.983243566726511, "grad_norm": 50.5, "learning_rate": 2.8426092160383003e-07, "loss": 1.7909, "mean_token_accuracy": 0.6144560068845749, "num_tokens": 4370286.0, "step": 9970 }, { "epoch": 2.9862357869539196, "grad_norm": 28.375, "learning_rate": 2.3439058448035108e-07, "loss": 1.7324, "mean_token_accuracy": 0.6381353378295899, "num_tokens": 4374860.0, "step": 9980 }, { "epoch": 2.9892280071813286, "grad_norm": 20.625, "learning_rate": 1.8452024735687214e-07, "loss": 1.4505, "mean_token_accuracy": 0.6755415856838226, "num_tokens": 4379167.0, "step": 9990 }, { "epoch": 2.9922202274087373, "grad_norm": 31.625, "learning_rate": 1.346499102333932e-07, "loss": 1.9699, "mean_token_accuracy": 0.589902862906456, "num_tokens": 4382621.0, "step": 10000 }, { "epoch": 2.995212447636146, "grad_norm": 29.125, "learning_rate": 8.477957310991422e-08, "loss": 1.7263, "mean_token_accuracy": 0.6296306014060974, "num_tokens": 4386420.0, "step": 10010 }, { "epoch": 2.998204667863555, "grad_norm": 34.75, "learning_rate": 3.490923598643527e-08, "loss": 1.5662, "mean_token_accuracy": 0.6742969185113907, "num_tokens": 4390095.0, "step": 10020 }, { "epoch": 3.0, "eval_loss": 2.109537124633789, "eval_mean_token_accuracy": 0.5822922750464026, "eval_num_tokens": 4392732.0, "eval_runtime": 28.1736, "eval_samples_per_second": 14.837, "eval_steps_per_second": 1.881, "step": 10026 } ], "logging_steps": 10, "max_steps": 10026, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8809891524214784e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }