{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.09758477677482313,
  "eval_steps": 10,
  "global_step": 400,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.00024396194193705782, "grad_norm": 4.8585286140441895, "learning_rate": 2.4999420463141455e-07, "loss": 2.9081, "step": 1},
    {"epoch": 0.00024396194193705782, "eval_loss": 2.639136552810669, "eval_runtime": 157.6053, "eval_samples_per_second": 1.624, "eval_steps_per_second": 0.812, "step": 1},
    {"epoch": 0.00048792388387411563, "grad_norm": 3.586596965789795, "learning_rate": 2.4998840671678217e-07, "loss": 2.4085, "step": 2},
    {"epoch": 0.0007318858258111735, "grad_norm": 4.514856815338135, "learning_rate": 2.499826062544247e-07, "loss": 2.867, "step": 3},
    {"epoch": 0.0009758477677482313, "grad_norm": 3.343158483505249, "learning_rate": 2.4997680324266246e-07, "loss": 2.5093, "step": 4},
    {"epoch": 0.0012198097096852891, "grad_norm": 4.163078784942627, "learning_rate": 2.499709976798144e-07, "loss": 2.9917, "step": 5},
    {"epoch": 0.001463771651622347, "grad_norm": 4.113401889801025, "learning_rate": 2.4996518956419777e-07, "loss": 2.8629, "step": 6},
    {"epoch": 0.0017077335935594047, "grad_norm": 2.110043525695801, "learning_rate": 2.499593788941286e-07, "loss": 2.3666, "step": 7},
    {"epoch": 0.0019516955354964625, "grad_norm": 3.960318088531494, "learning_rate": 2.499535656679212e-07, "loss": 2.6438, "step": 8},
    {"epoch": 0.0021956574774335204, "grad_norm": 3.959432601928711, "learning_rate": 2.499477498838886e-07, "loss": 2.6457, "step": 9},
    {"epoch": 0.0024396194193705783, "grad_norm": 2.219346523284912, "learning_rate": 2.4994193154034227e-07, "loss": 2.3086, "step": 10},
    {"epoch": 0.0024396194193705783, "eval_loss": 2.3810582160949707, "eval_runtime": 157.7847, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.811, "step": 10},
    {"epoch": 0.002683581361307636, "grad_norm": 2.600377082824707, "learning_rate": 2.499361106355922e-07, "loss": 2.3537, "step": 11},
    {"epoch": 0.002927543303244694, "grad_norm": 3.251347303390503, "learning_rate": 2.499302871679468e-07, "loss": 2.483, "step": 12},
    {"epoch": 0.0031715052451817514, "grad_norm": 2.1139895915985107, "learning_rate": 2.4992446113571303e-07, "loss": 2.288, "step": 13},
    {"epoch": 0.0034154671871188093, "grad_norm": 3.138744592666626, "learning_rate": 2.4991863253719657e-07, "loss": 2.4845, "step": 14},
    {"epoch": 0.003659429129055867, "grad_norm": 2.0805656909942627, "learning_rate": 2.4991280137070126e-07, "loss": 2.2727, "step": 15},
    {"epoch": 0.003903391070992925, "grad_norm": 2.57004714012146, "learning_rate": 2.499069676345297e-07, "loss": 2.3858, "step": 16},
    {"epoch": 0.004147353012929983, "grad_norm": 1.8521772623062134, "learning_rate": 2.499011313269829e-07, "loss": 2.2256, "step": 17},
    {"epoch": 0.004391314954867041, "grad_norm": 2.250250816345215, "learning_rate": 2.498952924463603e-07, "loss": 2.375, "step": 18},
    {"epoch": 0.004635276896804099, "grad_norm": 2.7878353595733643, "learning_rate": 2.498894509909601e-07, "loss": 2.0609, "step": 19},
    {"epoch": 0.0048792388387411565, "grad_norm": 2.4599826335906982, "learning_rate": 2.4988360695907864e-07, "loss": 2.1944, "step": 20},
    {"epoch": 0.0048792388387411565, "eval_loss": 2.1548237800598145, "eval_runtime": 157.9, "eval_samples_per_second": 1.621, "eval_steps_per_second": 0.811, "step": 20},
    {"epoch": 0.005123200780678214, "grad_norm": 2.118277072906494, "learning_rate": 2.49877760349011e-07, "loss": 2.1107, "step": 21},
    {"epoch": 0.005367162722615272, "grad_norm": 1.5559130907058716, "learning_rate": 2.498719111590508e-07, "loss": 1.992, "step": 22},
    {"epoch": 0.00561112466455233, "grad_norm": 2.998913049697876, "learning_rate": 2.498660593874899e-07, "loss": 2.2592, "step": 23},
    {"epoch": 0.005855086606489388, "grad_norm": 1.370886206626892, "learning_rate": 2.4986020503261886e-07, "loss": 2.0988, "step": 24},
    {"epoch": 0.006099048548426446, "grad_norm": 1.2692762613296509, "learning_rate": 2.498543480927266e-07, "loss": 2.1908, "step": 25},
    {"epoch": 0.006343010490363503, "grad_norm": 1.6744440793991089, "learning_rate": 2.4984848856610065e-07, "loss": 2.2077, "step": 26},
    {"epoch": 0.006586972432300561, "grad_norm": 1.3982892036437988, "learning_rate": 2.4984262645102706e-07, "loss": 2.2539, "step": 27},
    {"epoch": 0.006830934374237619, "grad_norm": 1.3442888259887695, "learning_rate": 2.4983676174579014e-07, "loss": 2.2487, "step": 28},
    {"epoch": 0.0070748963161746765, "grad_norm": 1.1121150255203247, "learning_rate": 2.498308944486729e-07, "loss": 2.024, "step": 29},
    {"epoch": 0.007318858258111734, "grad_norm": 1.4833574295043945, "learning_rate": 2.4982502455795676e-07, "loss": 2.107, "step": 30},
    {"epoch": 0.007318858258111734, "eval_loss": 2.051649570465088, "eval_runtime": 158.0175, "eval_samples_per_second": 1.62, "eval_steps_per_second": 0.81, "step": 30},
    {"epoch": 0.007562820200048792, "grad_norm": 1.5546934604644775, "learning_rate": 2.498191520719216e-07, "loss": 2.151, "step": 31},
    {"epoch": 0.00780678214198585, "grad_norm": 1.101186752319336, "learning_rate": 2.4981327698884575e-07, "loss": 2.0822, "step": 32},
    {"epoch": 0.008050744083922909, "grad_norm": 1.13623046875, "learning_rate": 2.498073993070061e-07, "loss": 2.0729, "step": 33},
    {"epoch": 0.008294706025859966, "grad_norm": 1.3326915502548218, "learning_rate": 2.49801519024678e-07, "loss": 2.2334, "step": 34},
    {"epoch": 0.008538667967797023, "grad_norm": 1.1969497203826904, "learning_rate": 2.497956361401352e-07, "loss": 2.1631, "step": 35},
    {"epoch": 0.008782629909734082, "grad_norm": 1.0180652141571045, "learning_rate": 2.4978975065165004e-07, "loss": 2.0552, "step": 36},
    {"epoch": 0.009026591851671139, "grad_norm": 1.7680776119232178, "learning_rate": 2.497838625574932e-07, "loss": 2.2854, "step": 37},
    {"epoch": 0.009270553793608197, "grad_norm": 1.048871397972107, "learning_rate": 2.497779718559339e-07, "loss": 2.27, "step": 38},
    {"epoch": 0.009514515735545254, "grad_norm": 1.0272551774978638, "learning_rate": 2.497720785452398e-07, "loss": 1.9276, "step": 39},
    {"epoch": 0.009758477677482313, "grad_norm": 0.9949386119842529, "learning_rate": 2.497661826236771e-07, "loss": 2.1643, "step": 40},
    {"epoch": 0.009758477677482313, "eval_loss": 1.9904688596725464, "eval_runtime": 157.9911, "eval_samples_per_second": 1.62, "eval_steps_per_second": 0.81, "step": 40},
    {"epoch": 0.01000243961941937, "grad_norm": 1.153521180152893, "learning_rate": 2.497602840895103e-07, "loss": 2.0555, "step": 41},
    {"epoch": 0.010246401561356429, "grad_norm": 1.1365783214569092, "learning_rate": 2.4975438294100266e-07, "loss": 1.9699, "step": 42},
    {"epoch": 0.010490363503293486, "grad_norm": 1.3392469882965088, "learning_rate": 2.497484791764155e-07, "loss": 2.1889, "step": 43},
    {"epoch": 0.010734325445230545, "grad_norm": 1.1810263395309448, "learning_rate": 2.4974257279400897e-07, "loss": 1.9938, "step": 44},
    {"epoch": 0.010978287387167602, "grad_norm": 0.8270505666732788, "learning_rate": 2.497366637920414e-07, "loss": 2.1701, "step": 45},
    {"epoch": 0.01122224932910466, "grad_norm": 1.1721283197402954, "learning_rate": 2.497307521687697e-07, "loss": 2.0702, "step": 46},
    {"epoch": 0.011466211271041717, "grad_norm": 0.8560613989830017, "learning_rate": 2.497248379224492e-07, "loss": 2.0357, "step": 47},
    {"epoch": 0.011710173212978776, "grad_norm": 2.072547674179077, "learning_rate": 2.497189210513339e-07, "loss": 2.1774, "step": 48},
    {"epoch": 0.011954135154915833, "grad_norm": 1.9676735401153564, "learning_rate": 2.497130015536758e-07, "loss": 2.1073, "step": 49},
    {"epoch": 0.012198097096852892, "grad_norm": 0.868861198425293, "learning_rate": 2.497070794277257e-07, "loss": 2.0378, "step": 50},
    {"epoch": 0.012198097096852892, "eval_loss": 1.958860993385315, "eval_runtime": 157.3873, "eval_samples_per_second": 1.627, "eval_steps_per_second": 0.813, "step": 50},
    {"epoch": 0.012442059038789949, "grad_norm": 1.0588116645812988, "learning_rate": 2.497011546717327e-07, "loss": 2.1439, "step": 51},
    {"epoch": 0.012686020980727006, "grad_norm": 0.9421451687812805, "learning_rate": 2.496952272839445e-07, "loss": 1.9826, "step": 52},
    {"epoch": 0.012929982922664065, "grad_norm": 0.88938969373703, "learning_rate": 2.4968929726260705e-07, "loss": 1.9675, "step": 53},
    {"epoch": 0.013173944864601122, "grad_norm": 0.8794369101524353, "learning_rate": 2.4968336460596485e-07, "loss": 1.9546, "step": 54},
    {"epoch": 0.01341790680653818, "grad_norm": 0.7067832350730896, "learning_rate": 2.4967742931226075e-07, "loss": 1.8798, "step": 55},
    {"epoch": 0.013661868748475237, "grad_norm": 1.4922388792037964, "learning_rate": 2.4967149137973625e-07, "loss": 1.9596, "step": 56},
    {"epoch": 0.013905830690412296, "grad_norm": 0.8123573660850525, "learning_rate": 2.496655508066309e-07, "loss": 1.9043, "step": 57},
    {"epoch": 0.014149792632349353, "grad_norm": 0.8600869178771973, "learning_rate": 2.4965960759118313e-07, "loss": 1.9608, "step": 58},
    {"epoch": 0.014393754574286412, "grad_norm": 0.7148178219795227, "learning_rate": 2.4965366173162953e-07, "loss": 2.0545, "step": 59},
    {"epoch": 0.014637716516223469, "grad_norm": 0.8177701234817505, "learning_rate": 2.4964771322620516e-07, "loss": 2.0236, "step": 60},
    {"epoch": 0.014637716516223469, "eval_loss": 1.934555172920227, "eval_runtime": 157.5281, "eval_samples_per_second": 1.625, "eval_steps_per_second": 0.813, "step": 60},
    {"epoch": 0.014881678458160527, "grad_norm": 0.6155992746353149, "learning_rate": 2.4964176207314356e-07, "loss": 2.066, "step": 61},
    {"epoch": 0.015125640400097584, "grad_norm": 0.9341537356376648, "learning_rate": 2.496358082706767e-07, "loss": 1.9537, "step": 62},
    {"epoch": 0.015369602342034643, "grad_norm": 1.3128167390823364, "learning_rate": 2.4962985181703483e-07, "loss": 2.0044, "step": 63},
    {"epoch": 0.0156135642839717, "grad_norm": 1.2402898073196411, "learning_rate": 2.496238927104469e-07, "loss": 1.962, "step": 64},
    {"epoch": 0.015857526225908757, "grad_norm": 0.8261551260948181, "learning_rate": 2.4961793094913995e-07, "loss": 2.1043, "step": 65},
    {"epoch": 0.016101488167845818, "grad_norm": 1.3150850534439087, "learning_rate": 2.4961196653133975e-07, "loss": 2.1101, "step": 66},
    {"epoch": 0.016345450109782875, "grad_norm": 0.5901480317115784, "learning_rate": 2.4960599945527027e-07, "loss": 1.7913, "step": 67},
    {"epoch": 0.01658941205171993, "grad_norm": 1.4552851915359497, "learning_rate": 2.49600029719154e-07, "loss": 1.9979, "step": 68},
    {"epoch": 0.01683337399365699, "grad_norm": 0.6188462376594543, "learning_rate": 2.495940573212118e-07, "loss": 1.759, "step": 69},
    {"epoch": 0.017077335935594046, "grad_norm": 0.6212908029556274, "learning_rate": 2.4958808225966306e-07, "loss": 1.9251, "step": 70},
    {"epoch": 0.017077335935594046, "eval_loss": 1.919191837310791, "eval_runtime": 157.4683, "eval_samples_per_second": 1.626, "eval_steps_per_second": 0.813, "step": 70},
    {"epoch": 0.017321297877531106, "grad_norm": 0.6586403250694275, "learning_rate": 2.4958210453272533e-07, "loss": 2.0447, "step": 71},
    {"epoch": 0.017565259819468163, "grad_norm": 0.6836444139480591, "learning_rate": 2.4957612413861483e-07, "loss": 2.0525, "step": 72},
    {"epoch": 0.01780922176140522, "grad_norm": 0.7636261582374573, "learning_rate": 2.4957014107554603e-07, "loss": 2.0984, "step": 73},
    {"epoch": 0.018053183703342277, "grad_norm": 0.5293551683425903, "learning_rate": 2.4956415534173195e-07, "loss": 1.8238, "step": 74},
    {"epoch": 0.018297145645279338, "grad_norm": 0.5500568151473999, "learning_rate": 2.495581669353838e-07, "loss": 1.8841, "step": 75},
    {"epoch": 0.018541107587216395, "grad_norm": 0.7883771061897278, "learning_rate": 2.4955217585471147e-07, "loss": 1.9951, "step": 76},
    {"epoch": 0.01878506952915345, "grad_norm": 0.6567949056625366, "learning_rate": 2.495461820979229e-07, "loss": 2.0119, "step": 77},
    {"epoch": 0.01902903147109051, "grad_norm": 0.8867214918136597, "learning_rate": 2.4954018566322477e-07, "loss": 1.8826, "step": 78},
    {"epoch": 0.01927299341302757, "grad_norm": 0.8271172642707825, "learning_rate": 2.4953418654882195e-07, "loss": 1.9226, "step": 79},
    {"epoch": 0.019516955354964626, "grad_norm": 0.5612655878067017, "learning_rate": 2.495281847529178e-07, "loss": 1.9987, "step": 80},
    {"epoch": 0.019516955354964626, "eval_loss": 1.9070545434951782, "eval_runtime": 157.7755, "eval_samples_per_second": 1.623, "eval_steps_per_second": 0.811, "step": 80},
    {"epoch": 0.019760917296901683, "grad_norm": 0.9746911525726318, "learning_rate": 2.4952218027371403e-07, "loss": 2.0771, "step": 81},
    {"epoch": 0.02000487923883874, "grad_norm": 0.7961266040802002, "learning_rate": 2.495161731094107e-07, "loss": 1.9497, "step": 82},
    {"epoch": 0.0202488411807758, "grad_norm": 0.5901756286621094, "learning_rate": 2.4951016325820637e-07, "loss": 1.9636, "step": 83},
    {"epoch": 0.020492803122712858, "grad_norm": 0.572099506855011, "learning_rate": 2.4950415071829794e-07, "loss": 2.0077, "step": 84},
    {"epoch": 0.020736765064649915, "grad_norm": 0.7444072961807251, "learning_rate": 2.4949813548788067e-07, "loss": 1.9713, "step": 85},
    {"epoch": 0.02098072700658697, "grad_norm": 1.6917086839675903, "learning_rate": 2.4949211756514816e-07, "loss": 2.1275, "step": 86},
    {"epoch": 0.02122468894852403, "grad_norm": 0.4941423535346985, "learning_rate": 2.494860969482926e-07, "loss": 2.0304, "step": 87},
    {"epoch": 0.02146865089046109, "grad_norm": 0.7001515626907349, "learning_rate": 2.4948007363550424e-07, "loss": 2.0102, "step": 88},
    {"epoch": 0.021712612832398146, "grad_norm": 0.6658152341842651, "learning_rate": 2.4947404762497197e-07, "loss": 1.6802, "step": 89},
    {"epoch": 0.021956574774335203, "grad_norm": 0.7706289291381836, "learning_rate": 2.49468018914883e-07, "loss": 2.0452, "step": 90},
    {"epoch": 0.021956574774335203, "eval_loss": 1.8989028930664062, "eval_runtime": 158.0707, "eval_samples_per_second": 1.62, "eval_steps_per_second": 0.81, "step": 90},
    {"epoch": 0.02220053671627226, "grad_norm": 0.4736054837703705, "learning_rate": 2.4946198750342283e-07, "loss": 1.9606, "step": 91},
    {"epoch": 0.02244449865820932, "grad_norm": 0.6369607448577881, "learning_rate": 2.4945595338877547e-07, "loss": 1.9367, "step": 92},
    {"epoch": 0.022688460600146378, "grad_norm": 0.780017614364624, "learning_rate": 2.494499165691231e-07, "loss": 1.8239, "step": 93},
    {"epoch": 0.022932422542083435, "grad_norm": 1.0048651695251465, "learning_rate": 2.4944387704264644e-07, "loss": 1.851, "step": 94},
    {"epoch": 0.02317638448402049, "grad_norm": 0.5539764165878296, "learning_rate": 2.494378348075246e-07, "loss": 1.7927, "step": 95},
    {"epoch": 0.023420346425957552, "grad_norm": 0.5273501873016357, "learning_rate": 2.494317898619349e-07, "loss": 1.7911, "step": 96},
    {"epoch": 0.02366430836789461, "grad_norm": 1.1313800811767578, "learning_rate": 2.4942574220405314e-07, "loss": 1.9152, "step": 97},
    {"epoch": 0.023908270309831666, "grad_norm": 0.8607046604156494, "learning_rate": 2.4941969183205344e-07, "loss": 2.0688, "step": 98},
    {"epoch": 0.024152232251768723, "grad_norm": 0.9859471321105957, "learning_rate": 2.494136387441083e-07, "loss": 2.0554, "step": 99},
    {"epoch": 0.024396194193705784, "grad_norm": 0.5871405005455017, "learning_rate": 2.494075829383886e-07, "loss": 1.8362, "step": 100},
    {"epoch": 0.024396194193705784, "eval_loss": 1.8896028995513916, "eval_runtime": 157.8345, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.811, "step": 100},
    {"epoch": 0.02464015613564284, "grad_norm": 0.5069964528083801, "learning_rate": 2.494015244130635e-07, "loss": 1.8013, "step": 101},
    {"epoch": 0.024884118077579898, "grad_norm": 0.7139447927474976, "learning_rate": 2.493954631663007e-07, "loss": 1.8216, "step": 102},
    {"epoch": 0.025128080019516955, "grad_norm": 0.48631080985069275, "learning_rate": 2.493893991962659e-07, "loss": 1.9325, "step": 103},
    {"epoch": 0.02537204196145401, "grad_norm": 0.5576779842376709, "learning_rate": 2.493833325011235e-07, "loss": 2.0052, "step": 104},
    {"epoch": 0.025616003903391072, "grad_norm": 0.6407865285873413, "learning_rate": 2.4937726307903606e-07, "loss": 1.9411, "step": 105},
    {"epoch": 0.02585996584532813, "grad_norm": 0.7654765248298645, "learning_rate": 2.493711909281646e-07, "loss": 1.9438, "step": 106},
    {"epoch": 0.026103927787265186, "grad_norm": 1.2607905864715576, "learning_rate": 2.493651160466685e-07, "loss": 2.0134, "step": 107},
    {"epoch": 0.026347889729202243, "grad_norm": 0.8633036017417908, "learning_rate": 2.493590384327053e-07, "loss": 1.9775, "step": 108},
    {"epoch": 0.026591851671139304, "grad_norm": 0.7568155527114868, "learning_rate": 2.49352958084431e-07, "loss": 1.9074, "step": 109},
    {"epoch": 0.02683581361307636, "grad_norm": 0.5505961179733276, "learning_rate": 2.49346875e-07, "loss": 1.8467, "step": 110},
    {"epoch": 0.02683581361307636, "eval_loss": 1.8828259706497192, "eval_runtime": 158.4116, "eval_samples_per_second": 1.616, "eval_steps_per_second": 0.808, "step": 110},
    {"epoch": 0.027079775555013418, "grad_norm": 0.5095446109771729, "learning_rate": 2.49340789177565e-07, "loss": 1.9961, "step": 111},
    {"epoch": 0.027323737496950475, "grad_norm": 1.7097959518432617, "learning_rate": 2.4933470061527687e-07, "loss": 1.9335, "step": 112},
    {"epoch": 0.027567699438887535, "grad_norm": 1.0115768909454346, "learning_rate": 2.493286093112851e-07, "loss": 1.8118, "step": 113},
    {"epoch": 0.027811661380824592, "grad_norm": 0.6412175297737122, "learning_rate": 2.493225152637374e-07, "loss": 1.9623, "step": 114},
    {"epoch": 0.02805562332276165, "grad_norm": 0.5357053875923157, "learning_rate": 2.4931641847077963e-07, "loss": 1.8131, "step": 115},
    {"epoch": 0.028299585264698706, "grad_norm": 0.6828150153160095, "learning_rate": 2.493103189305562e-07, "loss": 1.767, "step": 116},
    {"epoch": 0.028543547206635766, "grad_norm": 0.5804136395454407, "learning_rate": 2.493042166412099e-07, "loss": 1.9831, "step": 117},
    {"epoch": 0.028787509148572824, "grad_norm": 0.6375969052314758, "learning_rate": 2.492981116008816e-07, "loss": 1.9651, "step": 118},
    {"epoch": 0.02903147109050988, "grad_norm": 0.6621755957603455, "learning_rate": 2.492920038077106e-07, "loss": 2.1064, "step": 119},
    {"epoch": 0.029275433032446938, "grad_norm": 0.7436494827270508, "learning_rate": 2.492858932598346e-07, "loss": 1.8961, "step": 120},
    {"epoch": 0.029275433032446938, "eval_loss": 1.8782259225845337, "eval_runtime": 158.1634, "eval_samples_per_second": 1.619, "eval_steps_per_second": 0.809, "step": 120},
    {"epoch": 0.029519394974383995, "grad_norm": 0.5152058005332947, "learning_rate": 2.4927977995538954e-07, "loss": 1.875, "step": 121},
    {"epoch": 0.029763356916321055, "grad_norm": 0.4640464782714844, "learning_rate": 2.4927366389250973e-07, "loss": 1.8429, "step": 122},
    {"epoch": 0.030007318858258112, "grad_norm": 0.6126062273979187, "learning_rate": 2.4926754506932774e-07, "loss": 1.9581, "step": 123},
    {"epoch": 0.03025128080019517, "grad_norm": 0.5338674187660217, "learning_rate": 2.4926142348397453e-07, "loss": 1.9682, "step": 124},
    {"epoch": 0.030495242742132226, "grad_norm": 0.48220378160476685, "learning_rate": 2.492552991345792e-07, "loss": 1.9316, "step": 125},
    {"epoch": 0.030739204684069286, "grad_norm": 1.0571016073226929, "learning_rate": 2.4924917201926936e-07, "loss": 1.9837, "step": 126},
    {"epoch": 0.030983166626006343, "grad_norm": 0.5729621052742004, "learning_rate": 2.492430421361708e-07, "loss": 1.7242, "step": 127},
    {"epoch": 0.0312271285679434, "grad_norm": 0.9092426896095276, "learning_rate": 2.4923690948340783e-07, "loss": 1.8327, "step": 128},
    {"epoch": 0.03147109050988046, "grad_norm": 0.44636791944503784, "learning_rate": 2.4923077405910264e-07, "loss": 2.0464, "step": 129},
    {"epoch": 0.031715052451817514, "grad_norm": 0.6733670830726624, "learning_rate": 2.4922463586137616e-07, "loss": 1.8564, "step": 130},
    {"epoch": 0.031715052451817514, "eval_loss": 1.873685359954834, "eval_runtime": 158.2193, "eval_samples_per_second": 1.618, "eval_steps_per_second": 0.809, "step": 130},
    {"epoch": 0.03195901439375457, "grad_norm": 0.6245723366737366, "learning_rate": 2.4921849488834745e-07, "loss": 2.0072, "step": 131},
    {"epoch": 0.032202976335691635, "grad_norm": 0.47369739413261414, "learning_rate": 2.4921235113813376e-07, "loss": 2.0033, "step": 132},
    {"epoch": 0.03244693827762869, "grad_norm": 0.6961667537689209, "learning_rate": 2.492062046088508e-07, "loss": 1.8175, "step": 133},
    {"epoch": 0.03269090021956575, "grad_norm": 0.7953224182128906, "learning_rate": 2.4920005529861254e-07, "loss": 1.8035, "step": 134},
    {"epoch": 0.032934862161502806, "grad_norm": 0.516058087348938, "learning_rate": 2.491939032055311e-07, "loss": 1.8855, "step": 135},
    {"epoch": 0.03317882410343986, "grad_norm": 0.6488027572631836, "learning_rate": 2.491877483277171e-07, "loss": 1.9622, "step": 136},
    {"epoch": 0.03342278604537692, "grad_norm": 0.6827359199523926, "learning_rate": 2.4918159066327943e-07, "loss": 1.847, "step": 137},
    {"epoch": 0.03366674798731398, "grad_norm": 0.4918162226676941, "learning_rate": 2.49175430210325e-07, "loss": 1.9214, "step": 138},
    {"epoch": 0.033910709929251034, "grad_norm": 0.7824620008468628, "learning_rate": 2.491692669669594e-07, "loss": 1.8472, "step": 139},
    {"epoch": 0.03415467187118809, "grad_norm": 0.7084971070289612, "learning_rate": 2.4916310093128616e-07, "loss": 1.8638, "step": 140},
    {"epoch": 0.03415467187118809, "eval_loss": 1.869973063468933, "eval_runtime": 157.6522, "eval_samples_per_second": 1.624, "eval_steps_per_second": 0.812, "step": 140},
    {"epoch": 0.034398633813125155, "grad_norm": 0.4873005747795105, "learning_rate": 2.491569321014073e-07, "loss": 1.9326, "step": 141},
    {"epoch": 0.03464259575506221, "grad_norm": 0.6483212113380432, "learning_rate": 2.49150760475423e-07, "loss": 1.9035, "step": 142},
    {"epoch": 0.03488655769699927, "grad_norm": 0.46081703901290894, "learning_rate": 2.4914458605143187e-07, "loss": 1.9746, "step": 143},
    {"epoch": 0.035130519638936326, "grad_norm": 0.683131754398346, "learning_rate": 2.491384088275306e-07, "loss": 1.8517, "step": 144},
    {"epoch": 0.03537448158087338, "grad_norm": 0.4871167242527008, "learning_rate": 2.491322288018143e-07, "loss": 1.7198, "step": 145},
    {"epoch": 0.03561844352281044, "grad_norm": 0.6227270364761353, "learning_rate": 2.4912604597237626e-07, "loss": 1.8555, "step": 146},
    {"epoch": 0.0358624054647475, "grad_norm": 0.5372536182403564, "learning_rate": 2.4911986033730807e-07, "loss": 1.8245, "step": 147},
    {"epoch": 0.036106367406684554, "grad_norm": 0.7428392171859741, "learning_rate": 2.491136718946997e-07, "loss": 2.0657, "step": 148},
    {"epoch": 0.03635032934862162, "grad_norm": 0.9103279709815979, "learning_rate": 2.4910748064263914e-07, "loss": 1.9042, "step": 149},
    {"epoch": 0.036594291290558675, "grad_norm": 1.1896861791610718, "learning_rate": 2.491012865792129e-07, "loss": 1.8883, "step": 150},
    {"epoch": 0.036594291290558675, "eval_loss": 1.86661696434021, "eval_runtime": 158.3624, "eval_samples_per_second": 1.617, "eval_steps_per_second": 0.808, "step": 150},
    {"epoch": 0.03683825323249573, "grad_norm": 0.7221816182136536, "learning_rate": 2.490950897025056e-07, "loss": 1.8696, "step": 151},
    {"epoch": 0.03708221517443279, "grad_norm": 0.5009371042251587, "learning_rate": 2.4908889001060015e-07, "loss": 1.923, "step": 152},
    {"epoch": 0.037326177116369846, "grad_norm": 0.6172135472297668, "learning_rate": 2.490826875015777e-07, "loss": 1.9862, "step": 153},
    {"epoch": 0.0375701390583069, "grad_norm": 0.9549673199653625, "learning_rate": 2.490764821735178e-07, "loss": 1.9981, "step": 154},
    {"epoch": 0.03781410100024396, "grad_norm": 0.5264533758163452, "learning_rate": 2.4907027402449803e-07, "loss": 1.8822, "step": 155},
    {"epoch": 0.03805806294218102, "grad_norm": 0.4591792821884155, "learning_rate": 2.4906406305259434e-07, "loss": 1.9013, "step": 156},
    {"epoch": 0.038302024884118074, "grad_norm": 0.4885839819908142, "learning_rate": 2.4905784925588094e-07, "loss": 1.918, "step": 157},
    {"epoch": 0.03854598682605514, "grad_norm": 0.5201852917671204, "learning_rate": 2.4905163263243023e-07, "loss": 1.9607, "step": 158},
    {"epoch": 0.038789948767992195, "grad_norm": 0.7386835813522339, "learning_rate": 2.4904541318031294e-07, "loss": 1.8633, "step": 159},
    {"epoch": 0.03903391070992925, "grad_norm": 0.5655650496482849, "learning_rate": 2.49039190897598e-07, "loss": 1.9402, "step": 160},
    {"epoch": 0.03903391070992925, "eval_loss": 1.864166021347046, "eval_runtime": 158.2099, "eval_samples_per_second": 1.618, "eval_steps_per_second": 0.809, "step": 160},
    {"epoch": 0.03927787265186631, "grad_norm": 0.6714135408401489, "learning_rate": 2.490329657823525e-07, "loss": 1.7962, "step": 161},
    {"epoch": 0.039521834593803366, "grad_norm": 0.685165524482727, "learning_rate": 2.490267378326419e-07, "loss": 1.9055, "step": 162},
    {"epoch": 0.03976579653574042, "grad_norm": 0.5688671469688416, "learning_rate": 2.490205070465299e-07, "loss": 1.8434, "step": 163},
    {"epoch": 0.04000975847767748, "grad_norm": 0.6001088619232178, "learning_rate": 2.4901427342207823e-07, "loss": 1.8715, "step": 164},
    {"epoch": 0.04025372041961454, "grad_norm": 0.5576404929161072, "learning_rate": 2.490080369573472e-07, "loss": 1.8664, "step": 165},
    {"epoch": 0.0404976823615516, "grad_norm": 0.4974159002304077, "learning_rate": 2.4900179765039496e-07, "loss": 1.7923, "step": 166},
    {"epoch": 0.04074164430348866, "grad_norm": 0.48131653666496277, "learning_rate": 2.489955554992782e-07, "loss": 1.8561, "step": 167},
    {"epoch": 0.040985606245425715, "grad_norm": 0.49776557087898254, "learning_rate": 2.489893105020518e-07, "loss": 1.798, "step": 168},
    {"epoch": 0.04122956818736277, "grad_norm": 0.7587680220603943, "learning_rate": 2.489830626567686e-07, "loss": 1.9562, "step": 169},
    {"epoch": 0.04147353012929983, "grad_norm": 0.6052951216697693, "learning_rate": 2.4897681196148e-07, "loss": 1.9305, "step": 170},
    {"epoch": 0.04147353012929983, "eval_loss": 1.8620600700378418, "eval_runtime": 157.5929, "eval_samples_per_second": 1.624, "eval_steps_per_second": 0.812, "step": 170},
    {"epoch": 0.041717492071236886, "grad_norm": 0.5671830177307129, "learning_rate": 2.4897055841423537e-07, "loss": 1.8514, "step": 171},
    {"epoch": 0.04196145401317394, "grad_norm": 0.4015696346759796, "learning_rate": 2.489643020130825e-07, "loss": 1.8889, "step": 172},
    {"epoch": 0.042205415955111, "grad_norm": 0.8785597681999207, "learning_rate": 2.4895804275606724e-07, "loss": 1.8905, "step": 173},
    {"epoch": 0.04244937789704806, "grad_norm": 0.573078453540802, "learning_rate": 2.489517806412337e-07, "loss": 2.0164, "step": 174},
    {"epoch": 0.04269333983898512, "grad_norm": 0.48950624465942383, "learning_rate": 2.4894551566662435e-07, "loss": 2.0895, "step": 175},
    {"epoch": 0.04293730178092218, "grad_norm": 0.5515138506889343, "learning_rate": 2.4893924783027967e-07, "loss": 1.9163, "step": 176},
    {"epoch": 0.043181263722859235, "grad_norm": 0.4793028235435486, "learning_rate": 2.4893297713023835e-07, "loss": 1.8189, "step": 177},
    {"epoch": 0.04342522566479629, "grad_norm": 0.5240328311920166, "learning_rate": 2.4892670356453745e-07, "loss": 1.9361, "step": 178},
    {"epoch": 0.04366918760673335, "grad_norm": 0.5339527726173401, "learning_rate": 2.4892042713121207e-07, "loss": 1.9248, "step": 179},
    {"epoch": 0.043913149548670406, "grad_norm": 0.468458890914917, "learning_rate": 2.4891414782829566e-07, "loss": 1.9061, "step": 180},
    {"epoch": 0.043913149548670406, "eval_loss": 1.8581455945968628, "eval_runtime": 157.6293, "eval_samples_per_second": 1.624, "eval_steps_per_second": 0.812, "step": 180},
    {"epoch": 0.04415711149060746, "grad_norm": 0.5706861019134521, "learning_rate": 2.4890786565381976e-07, "loss": 1.8752, "step": 181},
    {"epoch": 0.04440107343254452, "grad_norm": 0.573175311088562, "learning_rate": 2.489015806058142e-07, "loss": 1.9895, "step": 182},
    {"epoch": 0.044645035374481584, "grad_norm": 1.2761479616165161, "learning_rate": 2.4889529268230683e-07, "loss": 1.9355, "step": 183},
    {"epoch": 0.04488899731641864, "grad_norm": 3.7102456092834473, "learning_rate": 2.4888900188132405e-07, "loss": 1.9278, "step": 184},
    {"epoch": 0.0451329592583557, "grad_norm": 0.5471494793891907, "learning_rate": 2.4888270820089003e-07, "loss": 1.9218, "step": 185},
    {"epoch": 0.045376921200292755, "grad_norm": 0.9872457385063171, "learning_rate": 2.488764116390274e-07, "loss": 1.936, "step": 186},
    {"epoch": 0.04562088314222981, "grad_norm": 0.528155505657196, "learning_rate": 2.488701121937568e-07, "loss": 1.9575, "step": 187},
    {"epoch": 0.04586484508416687, "grad_norm": 0.51887446641922, "learning_rate": 2.488638098630973e-07, "loss": 1.8338, "step": 188},
    {"epoch": 0.046108807026103926, "grad_norm": 0.4276951253414154, "learning_rate": 2.4885750464506606e-07, "loss": 2.0073, "step": 189},
    {"epoch": 0.04635276896804098, "grad_norm": 0.5127749443054199, "learning_rate": 2.488511965376782e-07, "loss": 1.9237, "step": 190},
    {"epoch": 0.04635276896804098, "eval_loss": 1.856198787689209, "eval_runtime": 157.9524, "eval_samples_per_second": 1.621, "eval_steps_per_second": 0.81, "step": 190},
    {"epoch": 0.04659673090997804, "grad_norm": 0.5734567046165466, "learning_rate": 2.488448855389473e-07, "loss": 1.955, "step": 191},
    {"epoch": 0.046840692851915104, "grad_norm": 0.4853633940219879, "learning_rate": 2.48838571646885e-07, "loss": 1.9313, "step": 192},
    {"epoch": 0.04708465479385216, "grad_norm": 0.8106932044029236, "learning_rate": 2.488322548595012e-07, "loss": 1.9164, "step": 193},
    {"epoch": 0.04732861673578922, "grad_norm": 0.6387647986412048, "learning_rate": 2.488259351748038e-07, "loss": 2.0275, "step": 194},
    {"epoch": 0.047572578677726275, "grad_norm": 0.48080340027809143, "learning_rate": 2.48819612590799e-07, "loss": 1.966, "step": 195},
    {"epoch": 0.04781654061966333, "grad_norm": 0.464213103055954, "learning_rate": 2.4881328710549126e-07, "loss": 1.8753, "step": 196},
    {"epoch": 0.04806050256160039, "grad_norm": 0.7000899314880371, "learning_rate": 2.48806958716883e-07, "loss": 2.0136, "step": 197},
    {"epoch": 0.048304464503537446, "grad_norm": 0.474881112575531, "learning_rate": 2.488006274229749e-07, "loss": 1.9193, "step": 198},
    {"epoch": 0.0485484264454745, "grad_norm": 0.5639634132385254, "learning_rate": 2.4879429322176583e-07, "loss": 1.8432, "step": 199},
    {"epoch": 0.04879238838741157, "grad_norm": 0.41461923718452454, "learning_rate": 2.4878795611125284e-07, "loss": 1.8943, "step": 200},
    {"epoch": 0.04879238838741157, "eval_loss": 1.8539921045303345, "eval_runtime": 157.8624, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.811, "step": 200},
    {"epoch": 0.049036350329348624, "grad_norm": 0.5546320080757141, "learning_rate": 2.487816160894311e-07, "loss": 1.8561, "step": 201},
    {"epoch": 0.04928031227128568, "grad_norm": 0.4563431441783905, "learning_rate": 2.4877527315429387e-07, "loss": 1.9516, "step": 202},
    {"epoch": 0.04952427421322274, "grad_norm": 0.48537513613700867, "learning_rate": 2.4876892730383267e-07, "loss": 2.0183, "step": 203},
    {"epoch": 0.049768236155159795, "grad_norm": 0.5398459434509277, "learning_rate": 2.4876257853603717e-07, "loss": 1.9771, "step": 204},
    {"epoch": 0.05001219809709685, "grad_norm": 0.47974419593811035, "learning_rate": 2.4875622684889513e-07, "loss": 1.8562, "step": 205},
    {"epoch": 0.05025616003903391, "grad_norm": 0.42705652117729187, "learning_rate": 2.4874987224039246e-07, "loss": 1.9547, "step": 206},
    {"epoch": 0.050500121980970966, "grad_norm": 1.4771904945373535, "learning_rate": 2.4874351470851334e-07, "loss": 1.9176, "step": 207},
    {"epoch": 0.05074408392290802, "grad_norm": 0.4936388432979584, "learning_rate": 2.4873715425123986e-07, "loss": 1.986, "step": 208},
    {"epoch": 0.05098804586484509, "grad_norm": 0.45525163412094116, "learning_rate": 2.4873079086655244e-07, "loss": 1.9623, "step": 209},
    {"epoch": 0.051232007806782144, "grad_norm": 0.429779052734375, "learning_rate": 2.487244245524296e-07, "loss": 1.7466, "step": 210},
    {"epoch": 0.051232007806782144, "eval_loss": 1.8527003526687622, "eval_runtime": 157.4992, "eval_samples_per_second": 1.625, "eval_steps_per_second": 0.813, "step": 210},
    {"epoch": 0.0514759697487192, "grad_norm": 0.4459904432296753, "learning_rate": 2.487180553068481e-07, "loss": 1.9284, "step": 211},
    {"epoch": 0.05171993169065626, "grad_norm": 0.8717539310455322, "learning_rate": 2.487116831277826e-07, "loss": 1.7543, "step": 212},
    {"epoch": 0.051963893632593315, "grad_norm": 0.7227014303207397, "learning_rate": 2.4870530801320607e-07, "loss": 1.8261, "step": 213},
    {"epoch": 0.05220785557453037, "grad_norm": 0.4853971302509308, "learning_rate": 2.486989299610895e-07, "loss": 1.9214, "step": 214},
    {"epoch": 0.05245181751646743, "grad_norm": 0.5626842975616455, "learning_rate": 2.4869254896940207e-07, "loss": 1.8116, "step": 215},
    {"epoch": 0.052695779458404486, "grad_norm": 0.4326629340648651, "learning_rate": 2.4868616503611124e-07, "loss": 1.7844, "step": 216},
    {"epoch": 0.05293974140034155, "grad_norm": 0.43978720903396606, "learning_rate": 2.486797781591823e-07, "loss": 1.7327, "step": 217},
    {"epoch": 0.05318370334227861, "grad_norm": 1.3520264625549316, "learning_rate": 2.4867338833657884e-07, "loss": 1.9084, "step": 218},
    {"epoch": 0.053427665284215664, "grad_norm": 1.791759967803955, "learning_rate": 2.4866699556626256e-07, "loss": 2.0314, "step": 219},
    {"epoch": 0.05367162722615272, "grad_norm": 0.7393069267272949, "learning_rate": 2.486605998461933e-07, "loss": 1.8518, "step": 220},
    {"epoch": 0.05367162722615272, "eval_loss": 1.850144386291504, "eval_runtime": 156.9992, "eval_samples_per_second": 1.631, "eval_steps_per_second": 0.815, "step": 220},
    {"epoch": 0.05391558916808978, "grad_norm": 0.4648591876029968, "learning_rate": 2.4865420117432884e-07, "loss": 1.9889, "step": 221},
    {"epoch": 0.054159551110026835, "grad_norm": 0.4539943337440491, "learning_rate": 2.4864779954862536e-07, "loss": 1.8777, "step": 222},
    {"epoch": 0.05440351305196389, "grad_norm": 23.188865661621094, "learning_rate": 2.486413949670369e-07, "loss": 1.9913, "step": 223},
    {"epoch": 0.05464747499390095, "grad_norm": 0.5861213803291321, "learning_rate": 2.486349874275158e-07, "loss": 1.5643, "step": 224},
    {"epoch": 0.054891436935838006, "grad_norm": 0.4710935056209564, "learning_rate": 2.486285769280123e-07, "loss": 1.9896, "step": 225},
    {"epoch": 0.05513539887777507, "grad_norm": 0.5323078632354736, "learning_rate": 2.48622163466475e-07, "loss": 1.7714, "step": 226},
    {"epoch": 0.05537936081971213, "grad_norm": 0.5247780680656433, "learning_rate": 2.486157470408504e-07, "loss": 1.9497, "step": 227},
    {"epoch": 0.055623322761649184, "grad_norm": 0.48543304204940796, "learning_rate": 2.4860932764908314e-07, "loss": 1.9012, "step": 228},
    {"epoch": 0.05586728470358624, "grad_norm": 0.5412744879722595, "learning_rate": 2.486029052891161e-07, "loss": 1.8044, "step": 229},
    {"epoch": 0.0561112466455233, "grad_norm": 0.4210870563983917, "learning_rate": 2.4859647995889003e-07, "loss": 1.7522, "step": 230},
    {"epoch": 0.0561112466455233, "eval_loss": 1.845929741859436, "eval_runtime": 157.1033, "eval_samples_per_second": 1.63, "eval_steps_per_second": 0.815, "step": 230},
    {"epoch": 0.056355208587460355, "grad_norm": 0.49198633432388306, "learning_rate": 2.4859005165634397e-07, "loss": 1.6787, "step": 231},
    {"epoch": 0.05659917052939741, "grad_norm": 0.4444589912891388, "learning_rate": 2.4858362037941493e-07, "loss": 1.8522, "step": 232},
    {"epoch": 0.05684313247133447, "grad_norm": 0.42611005902290344, "learning_rate": 2.485771861260381e-07, "loss": 1.773, "step": 233},
    {"epoch": 0.05708709441327153, "grad_norm": 0.44933363795280457, "learning_rate": 2.485707488941467e-07, "loss": 1.839, "step": 234},
    {"epoch": 0.05733105635520859, "grad_norm": 0.510879397392273, "learning_rate": 2.48564308681672e-07, "loss": 1.9736, "step": 235},
    {"epoch": 0.05757501829714565, "grad_norm": 0.48234203457832336, "learning_rate": 2.485578654865435e-07, "loss": 1.8358, "step": 236},
    {"epoch": 0.057818980239082704, "grad_norm": 0.5287805795669556, "learning_rate": 2.485514193066886e-07, "loss": 1.7455, "step": 237},
    {"epoch": 0.05806294218101976, "grad_norm": 0.4200873374938965, "learning_rate": 2.485449701400329e-07, "loss": 1.8146, "step": 238},
    {"epoch": 0.05830690412295682, "grad_norm": 0.42826953530311584, "learning_rate": 2.485385179845001e-07, "loss": 1.8783, "step": 239},
    {"epoch": 0.058550866064893875, "grad_norm": 0.6160483360290527, "learning_rate": 2.4853206283801187e-07, "loss": 2.0157, "step": 240},
    {"epoch": 0.058550866064893875, "eval_loss": 1.8428621292114258, "eval_runtime": 157.1726, "eval_samples_per_second": 1.629, "eval_steps_per_second": 0.814, "step": 240},
    {"epoch": 0.05879482800683093, "grad_norm": 0.517240047454834, "learning_rate": 2.4852560469848794e-07, "loss": 1.8066, "step": 241},
    {"epoch": 0.05903878994876799, "grad_norm": 0.45431217551231384, "learning_rate": 2.4851914356384624e-07, "loss": 1.763, "step": 242},
    {"epoch": 0.05928275189070505, "grad_norm": 0.5374858975410461, "learning_rate": 2.485126794320027e-07, "loss": 1.7991, "step": 243},
    {"epoch": 0.05952671383264211, "grad_norm": 0.4840785562992096, "learning_rate": 2.4850621230087125e-07, "loss": 1.9219, "step": 244},
    {"epoch": 0.05977067577457917, "grad_norm": 0.6035332083702087, "learning_rate": 2.4849974216836405e-07, "loss": 1.8103, "step": 245},
    {"epoch": 0.060014637716516224, "grad_norm": 0.44333499670028687, "learning_rate": 2.4849326903239115e-07, "loss": 1.8412, "step": 246},
    {"epoch": 0.06025859965845328, "grad_norm": 0.7768390774726868, "learning_rate": 2.4848679289086074e-07, "loss": 1.9089, "step": 247},
    {"epoch": 0.06050256160039034, "grad_norm": 0.5787532329559326, "learning_rate": 2.4848031374167913e-07, "loss": 1.9024, "step": 248},
    {"epoch": 0.060746523542327395, "grad_norm": 0.4455646276473999, "learning_rate": 2.484738315827505e-07, "loss": 1.9293, "step": 249},
    {"epoch": 0.06099048548426445, "grad_norm": 0.48859095573425293, "learning_rate": 2.484673464119773e-07, "loss": 1.8183, "step": 250},
    {"epoch": 0.06099048548426445, "eval_loss": 1.8416523933410645, "eval_runtime": 156.2376, "eval_samples_per_second": 1.639, "eval_steps_per_second": 0.819, "step": 250},
    {"epoch": 0.061234447426201516, "grad_norm": 0.4281693398952484, "learning_rate": 2.484608582272598e-07, "loss": 1.9258, "step": 251},
    {"epoch": 0.06147840936813857, "grad_norm": 0.43426513671875, "learning_rate": 2.4845436702649656e-07, "loss": 2.0341, "step": 252},
    {"epoch": 0.06172237131007563, "grad_norm": 0.5216272473335266, "learning_rate": 2.48447872807584e-07, "loss": 1.8391, "step": 253},
    {"epoch": 0.06196633325201269, "grad_norm": 0.4329265356063843, "learning_rate": 2.484413755684167e-07, "loss": 1.8692, "step": 254},
    {"epoch": 0.062210295193949744, "grad_norm": 1.1542620658874512, "learning_rate": 2.484348753068872e-07, "loss": 1.9009, "step": 255},
    {"epoch": 0.0624542571358868, "grad_norm": 0.44065535068511963, "learning_rate": 2.484283720208861e-07, "loss": 1.7906, "step": 256},
    {"epoch": 0.06269821907782386, "grad_norm": 0.4028589129447937, "learning_rate": 2.4842186570830207e-07, "loss": 1.821, "step": 257},
    {"epoch": 0.06294218101976091, "grad_norm": 0.5287508964538574, "learning_rate": 2.484153563670218e-07, "loss": 1.6887, "step": 258},
    {"epoch": 0.06318614296169797, "grad_norm": 0.472429096698761, "learning_rate": 2.4840884399493006e-07, "loss": 1.8086, "step": 259},
    {"epoch": 0.06343010490363503, "grad_norm": 0.40466898679733276, "learning_rate": 2.4840232858990943e-07, "loss": 1.8095, "step": 260},
    {"epoch": 0.06343010490363503, "eval_loss": 1.8428053855895996, "eval_runtime": 156.6484, "eval_samples_per_second": 1.634, "eval_steps_per_second": 0.817, "step": 260},
    {"epoch": 0.06367406684557209, "grad_norm": 0.5649131536483765, "learning_rate": 2.4839581014984084e-07, "loss": 1.8726, "step": 261},
    {"epoch": 0.06391802878750914, "grad_norm": 0.5180754065513611, "learning_rate": 2.48389288672603e-07, "loss": 1.9934, "step": 262},
    {"epoch": 0.0641619907294462, "grad_norm": 0.4884182810783386, "learning_rate": 2.483827641560728e-07, "loss": 1.7776, "step": 263},
    {"epoch": 0.06440595267138327, "grad_norm": 0.5376865267753601, "learning_rate": 2.48376236598125e-07, "loss": 1.7831, "step": 264},
    {"epoch": 0.06464991461332033, "grad_norm": 0.7305421829223633, "learning_rate": 2.4836970599663255e-07, "loss": 1.8499, "step": 265},
    {"epoch": 0.06489387655525738, "grad_norm": 0.4067825376987457, "learning_rate": 2.4836317234946626e-07, "loss": 1.9762, "step": 266},
    {"epoch": 0.06513783849719444, "grad_norm": 1.1095890998840332, "learning_rate": 2.48356635654495e-07, "loss": 1.884, "step": 267},
    {"epoch": 0.0653818004391315, "grad_norm": 1.5947470664978027, "learning_rate": 2.4835009590958575e-07, "loss": 1.8838, "step": 268},
    {"epoch": 0.06562576238106856, "grad_norm": 0.5433115363121033, "learning_rate": 2.483435531126034e-07, "loss": 1.9129, "step": 269},
    {"epoch": 0.06586972432300561, "grad_norm": 0.43899622559547424, "learning_rate": 2.483370072614108e-07, "loss": 1.7831, "step": 270},
    {"epoch": 0.06586972432300561, "eval_loss": 1.839111328125, "eval_runtime": 156.1734, "eval_samples_per_second": 1.639, "eval_steps_per_second": 0.82, "step": 270},
    {"epoch": 0.06611368626494267, "grad_norm": 0.44969475269317627, "learning_rate": 2.483304583538689e-07, "loss": 1.901, "step": 271},
    {"epoch": 0.06635764820687973, "grad_norm": 0.42426538467407227, "learning_rate": 2.4832390638783666e-07, "loss": 1.8534, "step": 272},
    {"epoch": 0.06660161014881678, "grad_norm": 0.511674702167511, "learning_rate": 2.4831735136117095e-07, "loss": 1.9139, "step": 273},
    {"epoch": 0.06684557209075384, "grad_norm": 0.43454718589782715, "learning_rate": 2.4831079327172674e-07, "loss": 1.9442, "step": 274},
    {"epoch": 0.0670895340326909, "grad_norm": 0.4460424780845642, "learning_rate": 2.4830423211735686e-07, "loss": 1.9378, "step": 275},
    {"epoch": 0.06733349597462795, "grad_norm": 0.6298746466636658, "learning_rate": 2.482976678959123e-07, "loss": 1.8372, "step": 276},
    {"epoch": 0.06757745791656501, "grad_norm": 0.44850224256515503, "learning_rate": 2.4829110060524197e-07, "loss": 1.8511, "step": 277},
    {"epoch": 0.06782141985850207, "grad_norm": 0.4357118308544159, "learning_rate": 2.482845302431927e-07, "loss": 1.763, "step": 278},
    {"epoch": 0.06806538180043913, "grad_norm": 0.3952440023422241, "learning_rate": 2.4827795680760933e-07, "loss": 1.9439, "step": 279},
    {"epoch": 0.06830934374237618, "grad_norm": 0.4903910458087921, "learning_rate": 2.482713802963348e-07, "loss": 1.811, "step": 280},
    {"epoch": 0.06830934374237618, "eval_loss": 1.8365715742111206, "eval_runtime": 157.7942, "eval_samples_per_second": 1.622, "eval_steps_per_second": 0.811, "step": 280},
    {"epoch": 0.06855330568431325, "grad_norm": 0.5027759075164795, "learning_rate": 2.4826480070720985e-07, "loss": 1.9209, "step": 281},
    {"epoch": 0.06879726762625031, "grad_norm": 0.4530917704105377, "learning_rate": 2.482582180380734e-07, "loss": 1.8037, "step": 282},
    {"epoch": 0.06904122956818737, "grad_norm": 0.4016598165035248, "learning_rate": 2.482516322867622e-07, "loss": 1.8756, "step": 283},
    {"epoch": 0.06928519151012442, "grad_norm": 0.4351702630519867, "learning_rate": 2.48245043451111e-07, "loss": 2.0021, "step": 284},
    {"epoch": 0.06952915345206148, "grad_norm": 0.4535478949546814, "learning_rate": 2.482384515289525e-07, "loss": 1.8903, "step": 285},
    {"epoch": 0.06977311539399854, "grad_norm": 0.4296678304672241, "learning_rate": 2.482318565181174e-07, "loss": 1.916, "step": 286},
    {"epoch": 0.0700170773359356, "grad_norm": 0.6348395347595215, "learning_rate": 2.4822525841643453e-07, "loss": 1.895, "step": 287},
    {"epoch": 0.07026103927787265, "grad_norm": 0.4949493706226349, "learning_rate": 2.482186572217303e-07, "loss": 2.07, "step": 288},
    {"epoch": 0.07050500121980971, "grad_norm": 0.4145565927028656, "learning_rate": 2.482120529318294e-07, "loss": 1.8886, "step": 289},
    {"epoch": 0.07074896316174677, "grad_norm": 0.5197605490684509, "learning_rate": 2.482054455445545e-07, "loss": 1.876, "step": 290},
    {"epoch": 0.07074896316174677, "eval_loss": 1.8359309434890747, "eval_runtime": 156.5279, "eval_samples_per_second": 1.635, "eval_steps_per_second": 0.818, "step": 290},
    {"epoch": 0.07099292510368382, "grad_norm": 0.42653581500053406, "learning_rate": 2.481988350577259e-07, "loss": 1.8605, "step": 291},
    {"epoch": 0.07123688704562088, "grad_norm": 0.3822322189807892, "learning_rate": 2.481922214691622e-07, "loss": 1.844, "step": 292},
    {"epoch": 0.07148084898755794, "grad_norm": 0.4121018946170807, "learning_rate": 2.481856047766798e-07, "loss": 1.9521, "step": 293},
    {"epoch": 0.071724810929495, "grad_norm": 0.3980840742588043, "learning_rate": 2.4817898497809304e-07, "loss": 1.8008, "step": 294},
    {"epoch": 0.07196877287143205, "grad_norm": 0.7482399344444275, "learning_rate": 2.4817236207121427e-07, "loss": 1.8344, "step": 295},
    {"epoch": 0.07221273481336911, "grad_norm": 0.5517648458480835, "learning_rate": 2.4816573605385374e-07, "loss": 1.9856, "step": 296},
    {"epoch": 0.07245669675530617, "grad_norm": 0.3954029381275177, "learning_rate": 2.481591069238197e-07, "loss": 1.7306, "step": 297},
    {"epoch": 0.07270065869724324, "grad_norm": 0.6213473677635193, "learning_rate": 2.481524746789182e-07, "loss": 1.873, "step": 298},
    {"epoch": 0.0729446206391803, "grad_norm": 0.42206960916519165, "learning_rate": 2.4814583931695343e-07, "loss": 1.9073, "step": 299},
    {"epoch": 0.07318858258111735, "grad_norm": 0.4138680100440979, "learning_rate": 2.4813920083572734e-07, "loss": 1.7581, "step": 300},
    {"epoch": 0.07318858258111735, "eval_loss": 1.8346822261810303, "eval_runtime": 156.8712, "eval_samples_per_second": 1.632, "eval_steps_per_second": 0.816, "step": 300},
    {"epoch": 0.07343254452305441, "grad_norm": 0.9438842535018921, "learning_rate": 2.481325592330399e-07, "loss": 1.8472, "step": 301},
    {"epoch": 0.07367650646499146, "grad_norm": 0.3860412538051605, "learning_rate": 2.4812591450668896e-07, "loss": 1.8402, "step": 302},
    {"epoch": 0.07392046840692852, "grad_norm": 0.33647987246513367, "learning_rate": 2.4811926665447034e-07, "loss": 1.9474, "step": 303},
    {"epoch": 0.07416443034886558, "grad_norm": 0.3667222559452057, "learning_rate": 2.481126156741779e-07, "loss": 1.8661, "step": 304},
    {"epoch": 0.07440839229080264, "grad_norm": 0.47111183404922485, "learning_rate": 2.481059615636031e-07, "loss": 1.7963, "step": 305},
    {"epoch": 0.07465235423273969, "grad_norm": 0.4970519244670868, "learning_rate": 2.480993043205356e-07, "loss": 1.7931, "step": 306},
    {"epoch": 0.07489631617467675, "grad_norm": 0.43172699213027954, "learning_rate": 2.4809264394276297e-07, "loss": 1.8096, "step": 307},
    {"epoch": 0.0751402781166138, "grad_norm": 1.3444660902023315, "learning_rate": 2.4808598042807057e-07, "loss": 1.9013, "step": 308},
    {"epoch": 0.07538424005855086, "grad_norm": 0.39566361904144287, "learning_rate": 2.4807931377424167e-07, "loss": 1.8494, "step": 309},
    {"epoch": 0.07562820200048792, "grad_norm": 0.37536919116973877, "learning_rate": 2.4807264397905757e-07, "loss": 1.9214, "step": 310},
    {"epoch": 0.07562820200048792, "eval_loss": 1.8326919078826904, "eval_runtime": 156.8066, "eval_samples_per_second": 1.633, "eval_steps_per_second": 0.816, "step": 310},
    {"epoch": 0.07587216394242498, "grad_norm": 0.515691339969635, "learning_rate": 2.480659710402974e-07, "loss": 1.8315, "step": 311},
    {"epoch": 0.07611612588436203, "grad_norm": 0.5210254192352295, "learning_rate": 2.480592949557383e-07, "loss": 1.9244, "step": 312},
    {"epoch": 0.07636008782629909, "grad_norm": 0.5208694338798523, "learning_rate": 2.4805261572315513e-07, "loss": 1.8838, "step": 313},
    {"epoch": 0.07660404976823615, "grad_norm": 0.4405214786529541, "learning_rate": 2.480459333403207e-07, "loss": 1.816, "step": 314},
    {"epoch": 0.07684801171017322, "grad_norm": 0.4438663423061371, "learning_rate": 2.480392478050059e-07, "loss": 1.7578, "step": 315},
    {"epoch": 0.07709197365211028, "grad_norm": 0.4870030879974365, "learning_rate": 2.4803255911497927e-07, "loss": 2.0076, "step": 316},
    {"epoch": 0.07733593559404733, "grad_norm": 0.44352516531944275, "learning_rate": 2.4802586726800744e-07, "loss": 1.8897, "step": 317},
    {"epoch": 0.07757989753598439, "grad_norm": 0.40144485235214233, "learning_rate": 2.4801917226185476e-07, "loss": 1.9574, "step": 318},
    {"epoch": 0.07782385947792145, "grad_norm": 0.4221437871456146, "learning_rate": 2.480124740942837e-07, "loss": 1.8748, "step": 319},
    {"epoch": 0.0780678214198585, "grad_norm": 0.39843979477882385, "learning_rate": 2.480057727630543e-07, "loss": 1.996, "step": 320},
    {"epoch": 0.0780678214198585, "eval_loss": 1.8313816785812378, "eval_runtime": 156.6502, "eval_samples_per_second": 1.634, "eval_steps_per_second": 0.817, "step": 320},
    {"epoch": 0.07831178336179556, "grad_norm": 0.7306655645370483, "learning_rate": 2.479990682659248e-07, "loss": 1.8732, "step": 321},
    {"epoch": 0.07855574530373262, "grad_norm": 0.46410149335861206, "learning_rate": 2.4799236060065104e-07, "loss": 1.9037, "step": 322},
    {"epoch": 0.07879970724566968, "grad_norm": 0.4528440833091736, "learning_rate": 2.47985649764987e-07, "loss": 1.8296, "step": 323},
    {"epoch": 0.07904366918760673, "grad_norm": 0.5731680989265442, "learning_rate": 2.4797893575668437e-07, "loss": 1.839, "step": 324},
    {"epoch": 0.07928763112954379, "grad_norm": 0.3977627456188202, "learning_rate": 2.4797221857349267e-07, "loss": 1.9664, "step": 325},
    {"epoch": 0.07953159307148085, "grad_norm": 0.7255275249481201,
|
"learning_rate": 2.4796549821315954e-07, |
|
"loss": 1.8649, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.0797755550134179, |
|
"grad_norm": 0.4904336929321289, |
|
"learning_rate": 2.479587746734302e-07, |
|
"loss": 1.945, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.08001951695535496, |
|
"grad_norm": 0.46819430589675903, |
|
"learning_rate": 2.4795204795204794e-07, |
|
"loss": 1.894, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.08026347889729202, |
|
"grad_norm": 0.8833802938461304, |
|
"learning_rate": 2.479453180467538e-07, |
|
"loss": 1.8628, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.08050744083922907, |
|
"grad_norm": 0.44334056973457336, |
|
"learning_rate": 2.479385849552867e-07, |
|
"loss": 1.8583, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08050744083922907, |
|
"eval_loss": 1.8302311897277832, |
|
"eval_runtime": 156.8163, |
|
"eval_samples_per_second": 1.632, |
|
"eval_steps_per_second": 0.816, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.08075140278116613, |
|
"grad_norm": 0.4154978394508362, |
|
"learning_rate": 2.479318486753834e-07, |
|
"loss": 1.7181, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.0809953647231032, |
|
"grad_norm": 0.5498473048210144, |
|
"learning_rate": 2.479251092047787e-07, |
|
"loss": 2.1092, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.08123932666504026, |
|
"grad_norm": 0.41959795355796814, |
|
"learning_rate": 2.4791836654120494e-07, |
|
"loss": 1.853, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.08148328860697732, |
|
"grad_norm": 0.48775970935821533, |
|
"learning_rate": 2.4791162068239256e-07, |
|
"loss": 1.878, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.08172725054891437, |
|
"grad_norm": 1.0387691259384155, |
|
"learning_rate": 2.4790487162606977e-07, |
|
"loss": 1.9639, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.08197121249085143, |
|
"grad_norm": 0.4307618737220764, |
|
"learning_rate": 2.478981193699626e-07, |
|
"loss": 1.798, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.08221517443278849, |
|
"grad_norm": 0.8073650598526001, |
|
"learning_rate": 2.478913639117949e-07, |
|
"loss": 1.8512, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.08245913637472554, |
|
"grad_norm": 0.785327136516571, |
|
"learning_rate": 2.478846052492885e-07, |
|
"loss": 1.8926, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.0827030983166626, |
|
"grad_norm": 0.4723658263683319, |
|
"learning_rate": 2.478778433801629e-07, |
|
"loss": 1.9997, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.08294706025859966, |
|
"grad_norm": 0.4107203185558319, |
|
"learning_rate": 2.478710783021355e-07, |
|
"loss": 1.8609, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08294706025859966, |
|
"eval_loss": 1.829516887664795, |
|
"eval_runtime": 156.5752, |
|
"eval_samples_per_second": 1.635, |
|
"eval_steps_per_second": 0.817, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.08319102220053672, |
|
"grad_norm": 0.40097326040267944, |
|
"learning_rate": 2.4786431001292156e-07, |
|
"loss": 1.7514, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.08343498414247377, |
|
"grad_norm": 0.39558151364326477, |
|
"learning_rate": 2.478575385102342e-07, |
|
"loss": 1.9019, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.08367894608441083, |
|
"grad_norm": 0.3937402367591858, |
|
"learning_rate": 2.4785076379178427e-07, |
|
"loss": 2.0703, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.08392290802634789, |
|
"grad_norm": 0.3737332820892334, |
|
"learning_rate": 2.478439858552805e-07, |
|
"loss": 1.8953, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.08416686996828494, |
|
"grad_norm": 0.3693140745162964, |
|
"learning_rate": 2.4783720469842943e-07, |
|
"loss": 1.8952, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.084410831910222, |
|
"grad_norm": 0.41011977195739746, |
|
"learning_rate": 2.4783042031893544e-07, |
|
"loss": 1.7306, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.08465479385215906, |
|
"grad_norm": 0.4407089352607727, |
|
"learning_rate": 2.478236327145007e-07, |
|
"loss": 1.8516, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.08489875579409611, |
|
"grad_norm": 0.4775758683681488, |
|
"learning_rate": 2.4781684188282526e-07, |
|
"loss": 1.8198, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.08514271773603319, |
|
"grad_norm": 0.37072694301605225, |
|
"learning_rate": 2.4781004782160693e-07, |
|
"loss": 1.9177, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.08538667967797024, |
|
"grad_norm": 0.3914446532726288, |
|
"learning_rate": 2.478032505285412e-07, |
|
"loss": 1.8334, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.08538667967797024, |
|
"eval_loss": 1.8291497230529785, |
|
"eval_runtime": 157.2832, |
|
"eval_samples_per_second": 1.628, |
|
"eval_steps_per_second": 0.814, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.0856306416199073, |
|
"grad_norm": 0.40111953020095825, |
|
"learning_rate": 2.4779645000132166e-07, |
|
"loss": 1.9745, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.08587460356184436, |
|
"grad_norm": 0.4218769967556, |
|
"learning_rate": 2.477896462376395e-07, |
|
"loss": 1.7767, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.08611856550378141, |
|
"grad_norm": 1.2748806476593018, |
|
"learning_rate": 2.4778283923518366e-07, |
|
"loss": 1.9835, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.08636252744571847, |
|
"grad_norm": 0.9254433512687683, |
|
"learning_rate": 2.477760289916411e-07, |
|
"loss": 1.8909, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.08660648938765553, |
|
"grad_norm": 1.155629277229309, |
|
"learning_rate": 2.477692155046964e-07, |
|
"loss": 2.0672, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.08685045132959258, |
|
"grad_norm": 0.6299034357070923, |
|
"learning_rate": 2.47762398772032e-07, |
|
"loss": 1.9787, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.08709441327152964, |
|
"grad_norm": 0.7239134907722473, |
|
"learning_rate": 2.4775557879132803e-07, |
|
"loss": 1.7728, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.0873383752134667, |
|
"grad_norm": 0.4112605154514313, |
|
"learning_rate": 2.4774875556026265e-07, |
|
"loss": 1.824, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.08758233715540376, |
|
"grad_norm": 0.4959578812122345, |
|
"learning_rate": 2.477419290765115e-07, |
|
"loss": 1.7778, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.08782629909734081, |
|
"grad_norm": 0.4753192961215973, |
|
"learning_rate": 2.4773509933774833e-07, |
|
"loss": 1.6845, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.08782629909734081, |
|
"eval_loss": 1.8272368907928467, |
|
"eval_runtime": 156.5455, |
|
"eval_samples_per_second": 1.635, |
|
"eval_steps_per_second": 0.818, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.08807026103927787, |
|
"grad_norm": 0.39284539222717285, |
|
"learning_rate": 2.4772826634164435e-07, |
|
"loss": 1.6858, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.08831422298121493, |
|
"grad_norm": 0.48466554284095764, |
|
"learning_rate": 2.4772143008586876e-07, |
|
"loss": 1.9059, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.08855818492315198, |
|
"grad_norm": 0.4809161424636841, |
|
"learning_rate": 2.4771459056808844e-07, |
|
"loss": 1.9083, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.08880214686508904, |
|
"grad_norm": 0.5406439900398254, |
|
"learning_rate": 2.477077477859681e-07, |
|
"loss": 1.8219, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.0890461088070261, |
|
"grad_norm": 0.5194385647773743, |
|
"learning_rate": 2.4770090173717014e-07, |
|
"loss": 1.7921, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.08929007074896317, |
|
"grad_norm": 0.412882536649704, |
|
"learning_rate": 2.4769405241935484e-07, |
|
"loss": 1.7941, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.08953403269090023, |
|
"grad_norm": 0.37151506543159485, |
|
"learning_rate": 2.476871998301802e-07, |
|
"loss": 1.7942, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.08977799463283728, |
|
"grad_norm": 0.4231220483779907, |
|
"learning_rate": 2.476803439673019e-07, |
|
"loss": 1.8722, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.09002195657477434, |
|
"grad_norm": 0.5867494344711304, |
|
"learning_rate": 2.476734848283735e-07, |
|
"loss": 1.9138, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.0902659185167114, |
|
"grad_norm": 0.3956262171268463, |
|
"learning_rate": 2.476666224110462e-07, |
|
"loss": 1.9813, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.0902659185167114, |
|
"eval_loss": 1.826444149017334, |
|
"eval_runtime": 157.275, |
|
"eval_samples_per_second": 1.628, |
|
"eval_steps_per_second": 0.814, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.09050988045864845, |
|
"grad_norm": 0.42614656686782837, |
|
"learning_rate": 2.476597567129691e-07, |
|
"loss": 1.7726, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.09075384240058551, |
|
"grad_norm": 0.47062888741493225, |
|
"learning_rate": 2.4765288773178894e-07, |
|
"loss": 1.8998, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.09099780434252257, |
|
"grad_norm": 0.43838515877723694, |
|
"learning_rate": 2.476460154651503e-07, |
|
"loss": 1.8538, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.09124176628445962, |
|
"grad_norm": 0.6669487357139587, |
|
"learning_rate": 2.4763913991069527e-07, |
|
"loss": 1.8683, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.09148572822639668, |
|
"grad_norm": 0.4067532420158386, |
|
"learning_rate": 2.4763226106606407e-07, |
|
"loss": 1.8279, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.09172969016833374, |
|
"grad_norm": 1.4081276655197144, |
|
"learning_rate": 2.476253789288943e-07, |
|
"loss": 1.6806, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.0919736521102708, |
|
"grad_norm": 0.5126282572746277, |
|
"learning_rate": 2.4761849349682154e-07, |
|
"loss": 1.7196, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.09221761405220785, |
|
"grad_norm": 0.47513243556022644, |
|
"learning_rate": 2.4761160476747895e-07, |
|
"loss": 1.7233, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.09246157599414491, |
|
"grad_norm": 0.5680952072143555, |
|
"learning_rate": 2.4760471273849755e-07, |
|
"loss": 1.9624, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.09270553793608197, |
|
"grad_norm": 0.4912157654762268, |
|
"learning_rate": 2.47597817407506e-07, |
|
"loss": 1.961, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09270553793608197, |
|
"eval_loss": 1.8258123397827148, |
|
"eval_runtime": 156.3289, |
|
"eval_samples_per_second": 1.638, |
|
"eval_steps_per_second": 0.819, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.09294949987801902, |
|
"grad_norm": 0.5005534291267395, |
|
"learning_rate": 2.475909187721307e-07, |
|
"loss": 1.8626, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.09319346181995608, |
|
"grad_norm": 0.45611926913261414, |
|
"learning_rate": 2.4758401682999573e-07, |
|
"loss": 1.919, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.09343742376189315, |
|
"grad_norm": 0.5665335655212402, |
|
"learning_rate": 2.475771115787231e-07, |
|
"loss": 1.8476, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.09368138570383021, |
|
"grad_norm": 0.4179742634296417, |
|
"learning_rate": 2.475702030159322e-07, |
|
"loss": 1.7702, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.09392534764576727, |
|
"grad_norm": 0.44780439138412476, |
|
"learning_rate": 2.475632911392405e-07, |
|
"loss": 1.7905, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.09416930958770432, |
|
"grad_norm": 0.9271466732025146, |
|
"learning_rate": 2.475563759462629e-07, |
|
"loss": 1.976, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.09441327152964138, |
|
"grad_norm": 0.6895579099655151, |
|
"learning_rate": 2.475494574346122e-07, |
|
"loss": 1.9016, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.09465723347157844, |
|
"grad_norm": 0.4328395426273346, |
|
"learning_rate": 2.475425356018988e-07, |
|
"loss": 1.7875, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.0949011954135155, |
|
"grad_norm": 0.4196988344192505, |
|
"learning_rate": 2.475356104457307e-07, |
|
"loss": 1.7607, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.09514515735545255, |
|
"grad_norm": 0.4333524703979492, |
|
"learning_rate": 2.4752868196371393e-07, |
|
"loss": 1.9771, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09514515735545255, |
|
"eval_loss": 1.8251597881317139, |
|
"eval_runtime": 157.0151, |
|
"eval_samples_per_second": 1.63, |
|
"eval_steps_per_second": 0.815, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.09538911929738961, |
|
"grad_norm": 0.6076596975326538, |
|
"learning_rate": 2.47521750153452e-07, |
|
"loss": 2.1356, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.09563308123932666, |
|
"grad_norm": 0.43572092056274414, |
|
"learning_rate": 2.4751481501254606e-07, |
|
"loss": 1.9217, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.09587704318126372, |
|
"grad_norm": 23.73161506652832, |
|
"learning_rate": 2.4750787653859505e-07, |
|
"loss": 2.1093, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.09612100512320078, |
|
"grad_norm": 0.46901410818099976, |
|
"learning_rate": 2.475009347291956e-07, |
|
"loss": 1.9877, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.09636496706513784, |
|
"grad_norm": 0.4053335189819336, |
|
"learning_rate": 2.47493989581942e-07, |
|
"loss": 1.9272, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.09660892900707489, |
|
"grad_norm": 0.4614839255809784, |
|
"learning_rate": 2.4748704109442635e-07, |
|
"loss": 1.885, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.09685289094901195, |
|
"grad_norm": 0.4277932047843933, |
|
"learning_rate": 2.4748008926423817e-07, |
|
"loss": 1.808, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.097096852890949, |
|
"grad_norm": 0.41171425580978394, |
|
"learning_rate": 2.474731340889649e-07, |
|
"loss": 1.928, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.09734081483288606, |
|
"grad_norm": 0.41549429297447205, |
|
"learning_rate": 2.4746617556619163e-07, |
|
"loss": 1.7844, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.09758477677482313, |
|
"grad_norm": 0.4279956817626953, |
|
"learning_rate": 2.4745921369350094e-07, |
|
"loss": 1.9173, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.09758477677482313, |
|
"eval_loss": 1.823663353919983, |
|
"eval_runtime": 157.0142, |
|
"eval_samples_per_second": 1.63, |
|
"eval_steps_per_second": 0.815, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 4099, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 10, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.0291845984681984e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|