{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 8.527180783817952, "eval_steps": 500, "global_step": 6745, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012642225031605562, "grad_norm": 0.06671903282403946, "learning_rate": 9e-06, "loss": 0.6431, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 10 }, { "epoch": 0.025284450063211124, "grad_norm": 0.08679291605949402, "learning_rate": 1.9e-05, "loss": 0.6499, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 20 }, { "epoch": 0.03792667509481669, "grad_norm": 0.07710310071706772, "learning_rate": 2.9e-05, "loss": 0.625, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 30 }, { "epoch": 0.05056890012642225, "grad_norm": 0.08816391229629517, "learning_rate": 3.9000000000000006e-05, "loss": 0.6109, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 40 }, { "epoch": 0.0632111251580278, "grad_norm": 0.1400187462568283, "learning_rate": 4.9e-05, "loss": 0.6043, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 50 }, { "epoch": 0.07585335018963338, "grad_norm": 0.08173350989818573, "learning_rate": 5.9e-05, "loss": 0.6308, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 60 }, { "epoch": 0.08849557522123894, "grad_norm": 0.09538205713033676, "learning_rate": 6.9e-05, "loss": 0.6078, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 70 }, { "epoch": 0.1011378002528445, "grad_norm": 0.10508744418621063, "learning_rate": 7.900000000000001e-05, "loss": 0.6266, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 80 }, { "epoch": 0.11378002528445007, "grad_norm": 0.13323046267032623, "learning_rate": 8.900000000000001e-05, "loss": 0.6393, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 90 }, { "epoch": 0.1264222503160556, "grad_norm": 0.13296917080879211, "learning_rate": 9.900000000000001e-05, "loss": 0.6361, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 100 }, { "epoch": 0.1390644753476612, "grad_norm": 0.14028862118721008, "learning_rate": 0.000109, "loss": 0.6576, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 110 }, { "epoch": 0.15170670037926676, "grad_norm": 0.17391778528690338, "learning_rate": 0.000119, "loss": 0.6084, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 120 }, { "epoch": 0.16434892541087232, "grad_norm": 0.1644161194562912, "learning_rate": 0.00012900000000000002, "loss": 0.658, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 130 }, { "epoch": 0.17699115044247787, "grad_norm": 0.2195376306772232, "learning_rate": 0.000139, "loss": 0.6274, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 140 }, { "epoch": 0.18963337547408343, "grad_norm": 0.2098621428012848, "learning_rate": 0.00014900000000000002, "loss": 0.5906, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 150 }, { "epoch": 0.202275600505689, "grad_norm": 0.24007147550582886, "learning_rate": 0.00015900000000000002, "loss": 0.6373, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 160 }, { "epoch": 0.21491782553729458, "grad_norm": 0.2894239127635956, "learning_rate": 0.00016900000000000002, "loss": 0.6371, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 170 }, { "epoch": 0.22756005056890014, "grad_norm": 0.23595209419727325, "learning_rate": 0.00017900000000000001, "loss": 0.6768, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 180 }, { "epoch": 0.2402022756005057, "grad_norm": 0.26761606335639954, "learning_rate": 0.00018899999999999999, "loss": 0.6431, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 190 }, { "epoch": 0.2528445006321112, "grad_norm": 0.2602802813053131, "learning_rate": 0.000199, "loss": 0.6443, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 200 }, { "epoch": 0.26548672566371684, "grad_norm": 0.4167614281177521, "learning_rate": 0.00019999934682007068, "loss": 0.6589, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 210 }, { "epoch": 0.2781289506953224, "grad_norm": 0.3008961081504822, "learning_rate": 0.00019999708892979201, "loss": 0.6986, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 220 }, { "epoch": 0.29077117572692796, "grad_norm": 0.5571665167808533, "learning_rate": 0.00019999321831722333, "loss": 0.6971, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 230 }, { "epoch": 0.3034134007585335, "grad_norm": 0.3433665335178375, "learning_rate": 0.000199987735071543, "loss": 0.6764, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 240 }, { "epoch": 0.31605562579013907, "grad_norm": 0.36731716990470886, "learning_rate": 0.00019998063931908407, "loss": 0.6622, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 250 }, { "epoch": 0.32869785082174463, "grad_norm": 0.4063098132610321, "learning_rate": 0.0001999719312233317, "loss": 0.6727, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 260 }, { "epoch": 0.3413400758533502, "grad_norm": 0.4796123504638672, "learning_rate": 0.0001999616109849191, "loss": 0.6683, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 270 }, { "epoch": 0.35398230088495575, "grad_norm": 0.35787850618362427, "learning_rate": 0.00019994967884162285, "loss": 0.6856, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 280 }, { "epoch": 0.3666245259165613, "grad_norm": 2.5480282306671143, "learning_rate": 0.00019993613506835787, "loss": 0.7337, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 290 }, { "epoch": 0.37926675094816686, "grad_norm": 132.1597137451172, "learning_rate": 0.00019992097997717054, "loss": 1.748, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 300 }, { "epoch": 0.3919089759797724, "grad_norm": 138.1280975341797, "learning_rate": 0.00019990421391723193, "loss": 2.0475, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 310 }, { "epoch": 0.404551201011378, "grad_norm": 11.301629066467285, "learning_rate": 0.00019988583727482948, "loss": 1.949, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 320 }, { "epoch": 0.41719342604298354, "grad_norm": 249.65650939941406, "learning_rate": 0.0001998658504733583, "loss": 1.3181, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 330 }, { "epoch": 0.42983565107458915, "grad_norm": 9.794897079467773, "learning_rate": 0.0001998442539733111, "loss": 1.588, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 340 }, { "epoch": 0.4424778761061947, "grad_norm": 1.245341181755066, "learning_rate": 0.00019982104827226808, "loss": 0.8035, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 350 }, { "epoch": 0.45512010113780027, "grad_norm": 1.8496769666671753, "learning_rate": 0.00019979623390488507, "loss": 0.7647, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 360 }, { "epoch": 0.46776232616940583, "grad_norm": 16.008167266845703, "learning_rate": 0.0001997698114428813, "loss": 0.7563, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 370 }, { "epoch": 0.4804045512010114, "grad_norm": 6.476025104522705, "learning_rate": 0.00019974178149502624, "loss": 0.8981, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 380 }, { "epoch": 0.49304677623261695, "grad_norm": 1.4814890623092651, "learning_rate": 0.0001997121447071257, "loss": 0.7543, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 390 }, { "epoch": 0.5056890012642224, "grad_norm": 0.695743978023529, "learning_rate": 0.0001996809017620067, "loss": 0.7414, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 400 }, { "epoch": 0.5183312262958281, "grad_norm": 9.911474227905273, "learning_rate": 0.000199648053379502, "loss": 0.7957, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 410 }, { "epoch": 0.5309734513274337, "grad_norm": 0.726256787776947, "learning_rate": 0.00019961360031643332, "loss": 0.7185, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 420 }, { "epoch": 0.5436156763590392, "grad_norm": 165.10116577148438, "learning_rate": 0.00019957754336659392, "loss": 0.901, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 430 }, { "epoch": 0.5562579013906448, "grad_norm": 41.61799621582031, "learning_rate": 0.0001995398833607306, "loss": 1.6791, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 440 }, { "epoch": 0.5689001264222503, "grad_norm": 26.158023834228516, "learning_rate": 0.0001995006211665241, "loss": 1.6933, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 450 }, { "epoch": 0.5815423514538559, "grad_norm": 1.6820884943008423, "learning_rate": 0.00019945975768856936, "loss": 0.8144, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 460 }, { "epoch": 0.5941845764854614, "grad_norm": 0.5475680828094482, "learning_rate": 0.00019941729386835472, "loss": 0.7117, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 470 }, { "epoch": 0.606826801517067, "grad_norm": 0.5968815684318542, "learning_rate": 0.0001993732306842402, "loss": 0.7452, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 480 }, { "epoch": 0.6194690265486725, "grad_norm": 3.038395404815674, "learning_rate": 0.00019932756915143481, "loss": 0.7365, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 490 }, { "epoch": 0.6321112515802781, "grad_norm": 1.1817647218704224, "learning_rate": 0.0001992803103219733, "loss": 0.7883, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 500 }, { "epoch": 0.6447534766118836, "grad_norm": 0.8520786166191101, "learning_rate": 0.00019923145528469202, "loss": 0.7309, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 510 }, { "epoch": 0.6573957016434893, "grad_norm": 0.9001318216323853, "learning_rate": 0.00019918100516520354, "loss": 0.8017, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 520 }, { "epoch": 0.6700379266750948, "grad_norm": 1.4401612281799316, "learning_rate": 0.00019912896112587092, "loss": 0.7814, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 530 }, { "epoch": 0.6826801517067004, "grad_norm": 34.98484420776367, "learning_rate": 0.00019907532436578098, "loss": 0.8461, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 540 }, { "epoch": 0.695322376738306, "grad_norm": 1.0898733139038086, "learning_rate": 0.00019902009612071645, "loss": 0.9027, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 550 }, { "epoch": 0.7079646017699115, "grad_norm": 38.014892578125, "learning_rate": 0.00019896327766312773, "loss": 0.8073, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 560 }, { "epoch": 0.7206068268015171, "grad_norm": 2.3141884803771973, "learning_rate": 0.0001989048703021035, "loss": 0.7773, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 570 }, { "epoch": 0.7332490518331226, "grad_norm": 0.8214466571807861, "learning_rate": 0.00019884487538334038, "loss": 0.8214, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 580 }, { "epoch": 0.7458912768647282, "grad_norm": 1.5186419486999512, "learning_rate": 0.00019878329428911227, "loss": 0.7852, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 590 }, { "epoch": 0.7585335018963337, "grad_norm": 1.7092262506484985, "learning_rate": 0.00019872012843823815, "loss": 0.8048, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 600 }, { "epoch": 0.7711757269279393, "grad_norm": 1.04222571849823, "learning_rate": 0.00019865537928604967, "loss": 0.7578, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 610 }, { "epoch": 0.7838179519595448, "grad_norm": 6.094375133514404, "learning_rate": 0.00019858904832435745, "loss": 0.8016, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 620 }, { "epoch": 0.7964601769911505, "grad_norm": 35.207767486572266, "learning_rate": 0.00019852113708141675, "loss": 0.8785, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 630 }, { "epoch": 0.809102402022756, "grad_norm": 2.344693422317505, "learning_rate": 0.00019845164712189233, "loss": 0.8803, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 640 }, { "epoch": 0.8217446270543616, "grad_norm": 3.754660129547119, "learning_rate": 0.00019838058004682224, "loss": 0.7746, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 650 }, { "epoch": 0.8343868520859671, "grad_norm": 0.9116389155387878, "learning_rate": 0.0001983079374935811, "loss": 0.756, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 660 }, { "epoch": 0.8470290771175727, "grad_norm": 2.610806703567505, "learning_rate": 0.0001982337211358423, "loss": 0.7773, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 670 }, { "epoch": 0.8596713021491783, "grad_norm": 10.215978622436523, "learning_rate": 0.00019815793268353944, "loss": 0.7995, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 680 }, { "epoch": 0.8723135271807838, "grad_norm": 1.8970898389816284, "learning_rate": 0.000198080573882827, "loss": 0.7576, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 690 }, { "epoch": 0.8849557522123894, "grad_norm": 4.906523704528809, "learning_rate": 0.00019800164651603987, "loss": 0.8217, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 700 }, { "epoch": 0.8975979772439949, "grad_norm": 44.493133544921875, "learning_rate": 0.0001979211524016527, "loss": 0.8068, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 710 }, { "epoch": 0.9102402022756005, "grad_norm": 348.29559326171875, "learning_rate": 0.00019783909339423758, "loss": 2.0656, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 720 }, { "epoch": 0.922882427307206, "grad_norm": 8.599038124084473, "learning_rate": 0.00019775547138442157, "loss": 1.7869, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 730 }, { "epoch": 0.9355246523388117, "grad_norm": 0.8781918287277222, "learning_rate": 0.00019767028829884313, "loss": 0.7973, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 740 }, { "epoch": 0.9481668773704172, "grad_norm": 1.5494831800460815, "learning_rate": 0.00019758354610010753, "loss": 0.7974, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 750 }, { "epoch": 0.9608091024020228, "grad_norm": 1.064113736152649, "learning_rate": 0.00019749524678674193, "loss": 0.8408, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 760 }, { "epoch": 0.9734513274336283, "grad_norm": 1.0886831283569336, "learning_rate": 0.00019740539239314898, "loss": 0.7968, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 770 }, { "epoch": 0.9860935524652339, "grad_norm": 1.2727103233337402, "learning_rate": 0.00019731398498956036, "loss": 0.7508, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 780 }, { "epoch": 0.9987357774968394, "grad_norm": 5.564798831939697, "learning_rate": 0.00019722102668198868, "loss": 0.7573, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 790 }, { "epoch": 1.011378002528445, "grad_norm": 0.7210967540740967, "learning_rate": 0.0001971265196121792, "loss": 0.6808, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 800 }, { "epoch": 1.0240202275600505, "grad_norm": 0.7716237306594849, "learning_rate": 0.00019703046595756054, "loss": 0.6197, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 810 }, { "epoch": 1.0366624525916561, "grad_norm": 0.8090146780014038, "learning_rate": 0.00019693286793119423, "loss": 0.6632, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 820 }, { "epoch": 1.0493046776232617, "grad_norm": 18.184663772583008, "learning_rate": 0.000196833727781724, "loss": 0.6638, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 830 }, { "epoch": 1.0619469026548674, "grad_norm": 0.7779182195663452, "learning_rate": 0.0001967330477933238, "loss": 0.6878, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 840 }, { "epoch": 1.0745891276864727, "grad_norm": 0.71025151014328, "learning_rate": 0.00019663083028564527, "loss": 0.6778, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 850 }, { "epoch": 1.0872313527180784, "grad_norm": 2.1243929862976074, "learning_rate": 0.0001965270776137642, "loss": 0.7326, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 860 }, { "epoch": 1.099873577749684, "grad_norm": 0.8968414068222046, "learning_rate": 0.0001964217921681265, "loss": 0.6792, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 870 }, { "epoch": 1.1125158027812896, "grad_norm": 5.795286655426025, "learning_rate": 0.00019631497637449274, "loss": 0.7356, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 880 }, { "epoch": 1.125158027812895, "grad_norm": 1.2587428092956543, "learning_rate": 0.0001962066326938826, "loss": 0.7505, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 890 }, { "epoch": 1.1378002528445006, "grad_norm": 1.1835522651672363, "learning_rate": 0.000196096763622518, "loss": 0.7025, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 900 }, { "epoch": 1.1504424778761062, "grad_norm": 36.68544387817383, "learning_rate": 0.00019598537169176564, "loss": 0.6972, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 910 }, { "epoch": 1.1630847029077118, "grad_norm": 877.5889892578125, "learning_rate": 0.0001958724594680787, "loss": 1.3335, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 920 }, { "epoch": 1.1757269279393174, "grad_norm": 536.0718383789062, "learning_rate": 0.00019575802955293763, "loss": 8.5679, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 930 }, { "epoch": 1.1883691529709228, "grad_norm": 22.20748519897461, "learning_rate": 0.00019564208458279034, "loss": 10.4269, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 940 }, { "epoch": 1.2010113780025284, "grad_norm": 58.90277862548828, "learning_rate": 0.00019552462722899122, "loss": 7.7899, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 950 }, { "epoch": 1.213653603034134, "grad_norm": 38.07368469238281, "learning_rate": 0.00019540566019773996, "loss": 8.0968, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 960 }, { "epoch": 1.2262958280657394, "grad_norm": 3.39542555809021, "learning_rate": 0.00019528518623001878, "loss": 7.6929, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 970 }, { "epoch": 1.238938053097345, "grad_norm": 8.227216720581055, "learning_rate": 0.0001951632081015296, "loss": 7.5381, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 980 }, { "epoch": 1.2515802781289507, "grad_norm": 15.710060119628906, "learning_rate": 0.00019503972862263002, "loss": 7.471, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 990 }, { "epoch": 1.2642225031605563, "grad_norm": 14.865936279296875, "learning_rate": 0.00019491475063826842, "loss": 7.1013, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1000 }, { "epoch": 1.276864728192162, "grad_norm": 11.446512222290039, "learning_rate": 0.00019478827702791858, "loss": 7.1631, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1010 }, { "epoch": 1.2895069532237673, "grad_norm": 4.988636016845703, "learning_rate": 0.00019466031070551325, "loss": 6.9726, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1020 }, { "epoch": 1.302149178255373, "grad_norm": 6.260726451873779, "learning_rate": 0.00019453085461937705, "loss": 6.8037, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1030 }, { "epoch": 1.3147914032869785, "grad_norm": 13.696749687194824, "learning_rate": 0.00019439991175215857, "loss": 6.823, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1040 }, { "epoch": 1.3274336283185841, "grad_norm": 4.492152690887451, "learning_rate": 0.0001942674851207615, "loss": 6.694, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1050 }, { "epoch": 1.3400758533501897, "grad_norm": 14.445012092590332, "learning_rate": 0.00019413357777627534, "loss": 6.5831, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1060 }, { "epoch": 1.3527180783817951, "grad_norm": 187.8795623779297, "learning_rate": 0.00019399819280390492, "loss": 6.4136, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1070 }, { "epoch": 1.3653603034134008, "grad_norm": 40.901546478271484, "learning_rate": 0.00019386133332289948, "loss": 5.5392, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1080 }, { "epoch": 1.3780025284450064, "grad_norm": 7.47464656829834, "learning_rate": 0.00019372300248648064, "loss": 2.8936, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1090 }, { "epoch": 1.3906447534766118, "grad_norm": 2.4701592922210693, "learning_rate": 0.00019358320348176978, "loss": 0.7719, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1100 }, { "epoch": 1.4032869785082174, "grad_norm": 1.2766318321228027, "learning_rate": 0.00019344193952971486, "loss": 0.7532, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1110 }, { "epoch": 1.415929203539823, "grad_norm": 1.149214744567871, "learning_rate": 0.00019329921388501573, "loss": 0.7712, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1120 }, { "epoch": 1.4285714285714286, "grad_norm": 3.015934944152832, "learning_rate": 0.0001931550298360496, "loss": 0.7567, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1130 }, { "epoch": 1.4412136536030342, "grad_norm": 0.9380026459693909, "learning_rate": 0.00019300939070479508, "loss": 0.7604, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1140 }, { "epoch": 1.4538558786346396, "grad_norm": 1.0415725708007812, "learning_rate": 0.00019286229984675558, "loss": 0.7313, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1150 }, { "epoch": 1.4664981036662452, "grad_norm": 1.5267181396484375, "learning_rate": 0.0001927137606508821, "loss": 0.7071, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1160 }, { "epoch": 1.4791403286978508, "grad_norm": 1.7632757425308228, "learning_rate": 0.00019256377653949515, "loss": 0.898, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1170 }, { "epoch": 1.4917825537294565, "grad_norm": 2.4399545192718506, "learning_rate": 0.00019241235096820587, "loss": 0.7592, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1180 }, { "epoch": 1.504424778761062, "grad_norm": 3.498751163482666, "learning_rate": 0.00019225948742583642, "loss": 0.8975, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1190 }, { "epoch": 1.5170670037926675, "grad_norm": 5.950336456298828, "learning_rate": 0.00019210518943433953, "loss": 0.7509, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1200 }, { "epoch": 1.529709228824273, "grad_norm": 13.960210800170898, "learning_rate": 0.00019194946054871753, "loss": 0.7932, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1210 }, { "epoch": 1.5423514538558787, "grad_norm": 2.3052141666412354, "learning_rate": 0.0001917923043569403, "loss": 0.9414, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1220 }, { "epoch": 1.554993678887484, "grad_norm": 1.8680328130722046, "learning_rate": 0.0001916337244798625, "loss": 0.7507, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1230 }, { "epoch": 1.56763590391909, "grad_norm": 1.9041931629180908, "learning_rate": 0.00019147372457114045, "loss": 0.7368, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1240 }, { "epoch": 1.5802781289506953, "grad_norm": 1.2977467775344849, "learning_rate": 0.00019131230831714776, "loss": 0.8548, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1250 }, { "epoch": 1.592920353982301, "grad_norm": 1.5000057220458984, "learning_rate": 0.00019114947943689036, "loss": 0.7704, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1260 }, { "epoch": 1.6055625790139065, "grad_norm": 3.4347245693206787, "learning_rate": 0.00019098524168192094, "loss": 0.7786, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1270 }, { "epoch": 1.618204804045512, "grad_norm": 2.482739210128784, "learning_rate": 0.00019081959883625235, "loss": 0.7569, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1280 }, { "epoch": 1.6308470290771175, "grad_norm": 1.322037696838379, "learning_rate": 0.00019065255471627062, "loss": 0.7714, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1290 }, { "epoch": 1.6434892541087232, "grad_norm": 1.073613166809082, "learning_rate": 0.00019048411317064683, "loss": 0.742, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1300 }, { "epoch": 1.6561314791403285, "grad_norm": 0.8305187225341797, "learning_rate": 0.00019031427808024866, "loss": 0.7216, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1310 }, { "epoch": 1.6687737041719344, "grad_norm": 1.1198879480361938, "learning_rate": 0.0001901430533580508, "loss": 0.7477, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1320 }, { "epoch": 1.6814159292035398, "grad_norm": 0.9730642437934875, "learning_rate": 0.0001899704429490447, "loss": 0.765, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1330 }, { "epoch": 1.6940581542351454, "grad_norm": 1.542136311531067, "learning_rate": 0.00018979645083014809, "loss": 0.7338, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1340 }, { "epoch": 1.706700379266751, "grad_norm": 1.3562628030776978, "learning_rate": 0.00018962108101011285, "loss": 0.7786, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1350 }, { "epoch": 1.7193426042983564, "grad_norm": 1.8742653131484985, "learning_rate": 0.000189444337529433, "loss": 0.7812, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1360 }, { "epoch": 1.7319848293299622, "grad_norm": 3.535946846008301, "learning_rate": 0.0001892662244602515, "loss": 0.7653, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1370 }, { "epoch": 1.7446270543615676, "grad_norm": 0.9589079022407532, "learning_rate": 0.00018908674590626637, "loss": 0.8217, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1380 }, { "epoch": 1.7572692793931732, "grad_norm": 2.254733085632324, "learning_rate": 0.00018890590600263618, "loss": 0.7535, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1390 }, { "epoch": 1.7699115044247788, "grad_norm": 0.8984112739562988, "learning_rate": 0.00018872370891588491, "loss": 0.7839, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1400 }, { "epoch": 1.7825537294563842, "grad_norm": 1.0168917179107666, "learning_rate": 0.00018854015884380568, "loss": 0.7443, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1410 }, { "epoch": 1.7951959544879899, "grad_norm": 0.9075338840484619, "learning_rate": 0.00018835526001536424, "loss": 0.7515, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1420 }, { "epoch": 1.8078381795195955, "grad_norm": 0.9690259695053101, "learning_rate": 0.00018816901669060156, "loss": 0.8179, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1430 }, { "epoch": 1.8204804045512009, "grad_norm": 1.043910026550293, "learning_rate": 0.0001879814331605355, "loss": 0.8088, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1440 }, { "epoch": 1.8331226295828067, "grad_norm": 0.9964724779129028, "learning_rate": 0.00018779251374706206, "loss": 0.7603, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1450 }, { "epoch": 1.845764854614412, "grad_norm": 0.896278440952301, "learning_rate": 0.00018760226280285585, "loss": 0.7666, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1460 }, { "epoch": 1.8584070796460177, "grad_norm": 1.0416340827941895, "learning_rate": 0.00018741068471126967, "loss": 0.7295, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1470 }, { "epoch": 1.8710493046776233, "grad_norm": 1.1354191303253174, "learning_rate": 0.00018721778388623367, "loss": 0.7552, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1480 }, { "epoch": 1.8836915297092287, "grad_norm": 1.0638015270233154, "learning_rate": 0.00018702356477215352, "loss": 0.7663, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1490 }, { "epoch": 1.8963337547408345, "grad_norm": 0.9380121231079102, "learning_rate": 0.00018682803184380807, "loss": 0.7436, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1500 }, { "epoch": 1.90897597977244, "grad_norm": 0.9272292256355286, "learning_rate": 0.0001866311896062463, "loss": 0.8219, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1510 }, { "epoch": 1.9216182048040455, "grad_norm": 0.9718897938728333, "learning_rate": 0.00018643304259468346, "loss": 0.7357, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1520 }, { "epoch": 1.9342604298356512, "grad_norm": 0.8963416218757629, "learning_rate": 0.00018623359537439654, "loss": 0.7421, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1530 }, { "epoch": 1.9469026548672566, "grad_norm": 0.8436943888664246, "learning_rate": 0.0001860328525406192, "loss": 0.8123, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1540 }, { "epoch": 1.9595448798988622, "grad_norm": 0.9509057998657227, "learning_rate": 0.00018583081871843585, "loss": 0.785, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1550 }, { "epoch": 1.9721871049304678, "grad_norm": 1.5439331531524658, "learning_rate": 0.00018562749856267495, "loss": 0.7564, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1560 }, { "epoch": 1.9848293299620732, "grad_norm": 1.1488640308380127, "learning_rate": 0.00018542289675780208, "loss": 0.7905, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1570 }, { "epoch": 1.997471554993679, "grad_norm": 0.8320059776306152, "learning_rate": 0.00018521701801781172, "loss": 0.7636, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1580 }, { "epoch": 2.0101137800252844, "grad_norm": 24.31561851501465, "learning_rate": 0.00018500986708611868, "loss": 0.7004, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1590 }, { "epoch": 2.02275600505689, "grad_norm": 0.9889429211616516, "learning_rate": 0.00018480144873544898, "loss": 0.6018, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1600 }, { "epoch": 2.0353982300884956, "grad_norm": 0.852366030216217, "learning_rate": 0.0001845917677677298, "loss": 0.6404, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1610 }, { "epoch": 2.048040455120101, "grad_norm": 0.8665163516998291, "learning_rate": 0.00018438082901397866, "loss": 0.6277, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1620 }, { "epoch": 2.060682680151707, "grad_norm": 0.959322452545166, "learning_rate": 0.00018416863733419246, "loss": 0.6274, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1630 }, { "epoch": 2.0733249051833123, "grad_norm": 0.7421912550926208, "learning_rate": 0.0001839551976172352, "loss": 0.6483, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1640 }, { "epoch": 2.0859671302149176, "grad_norm": 1.5782485008239746, "learning_rate": 0.0001837405147807256, "loss": 0.6964, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1650 }, { "epoch": 2.0986093552465235, "grad_norm": 0.793574869632721, "learning_rate": 0.00018352459377092347, "loss": 0.6323, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1660 }, { "epoch": 2.111251580278129, "grad_norm": 0.7756363153457642, "learning_rate": 0.00018330743956261616, "loss": 0.6988, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1670 }, { "epoch": 2.1238938053097347, "grad_norm": 0.8382811546325684, "learning_rate": 0.0001830890571590036, "loss": 0.6159, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1680 }, { "epoch": 2.13653603034134, "grad_norm": 1.7289704084396362, "learning_rate": 0.0001828694515915831, "loss": 0.655, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1690 }, { "epoch": 2.1491782553729455, "grad_norm": 0.8287073373794556, "learning_rate": 0.00018264862792003367, "loss": 0.6869, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1700 }, { "epoch": 2.1618204804045513, "grad_norm": 0.8839928507804871, "learning_rate": 0.00018242659123209905, "loss": 0.6807, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1710 }, { "epoch": 2.1744627054361567, "grad_norm": 0.9569761753082275, "learning_rate": 0.0001822033466434708, "loss": 0.6826, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1720 }, { "epoch": 2.187104930467762, "grad_norm": 1.1782281398773193, "learning_rate": 0.00018197889929767036, "loss": 0.6532, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1730 }, { "epoch": 2.199747155499368, "grad_norm": 4.368149280548096, "learning_rate": 0.00018175325436593044, "loss": 0.6681, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1740 }, { "epoch": 2.2123893805309733, "grad_norm": 0.9262805581092834, "learning_rate": 0.00018152641704707593, "loss": 0.6776, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1750 }, { "epoch": 2.225031605562579, "grad_norm": 4.026210784912109, "learning_rate": 0.0001812983925674042, "loss": 0.6965, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1760 }, { "epoch": 2.2376738305941846, "grad_norm": 0.9288873076438904, "learning_rate": 0.00018106918618056463, "loss": 0.7156, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1770 }, { "epoch": 2.25031605562579, "grad_norm": 0.9781466126441956, "learning_rate": 0.00018083880316743757, "loss": 0.6843, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1780 }, { "epoch": 2.262958280657396, "grad_norm": 0.8335726857185364, "learning_rate": 0.00018060724883601248, "loss": 0.6722, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1790 }, { "epoch": 2.275600505689001, "grad_norm": 0.8793342709541321, "learning_rate": 0.00018037452852126613, "loss": 0.649, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1800 }, { "epoch": 2.288242730720607, "grad_norm": 0.868864893913269, "learning_rate": 0.00018014064758503908, "loss": 0.6749, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1810 }, { "epoch": 2.3008849557522124, "grad_norm": 0.8861690759658813, "learning_rate": 0.00017990561141591264, "loss": 0.6893, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1820 }, { "epoch": 2.313527180783818, "grad_norm": 0.8054774403572083, "learning_rate": 0.00017966942542908435, "loss": 0.7254, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1830 }, { "epoch": 2.3261694058154236, "grad_norm": 0.9192434549331665, "learning_rate": 0.0001794320950662435, "loss": 0.7071, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1840 }, { "epoch": 2.338811630847029, "grad_norm": 1.0894279479980469, "learning_rate": 0.0001791936257954456, "loss": 0.6882, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1850 }, { "epoch": 2.351453855878635, "grad_norm": 0.976393461227417, "learning_rate": 0.0001789540231109863, "loss": 0.6996, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1860 }, { "epoch": 2.3640960809102403, "grad_norm": 1.0295621156692505, "learning_rate": 0.0001787132925332751, "loss": 0.7212, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1870 }, { "epoch": 2.3767383059418457, "grad_norm": 0.9011755585670471, "learning_rate": 0.00017847143960870792, "loss": 0.6803, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1880 }, { "epoch": 2.3893805309734515, "grad_norm": 0.9422768354415894, "learning_rate": 0.00017822846990953942, "loss": 0.7172, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1890 }, { "epoch": 2.402022756005057, "grad_norm": 0.976975679397583, "learning_rate": 0.00017798438903375452, "loss": 0.6627, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1900 }, { "epoch": 2.4146649810366623, "grad_norm": 0.8325662016868591, "learning_rate": 0.00017773920260493942, "loss": 0.6819, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1910 }, { "epoch": 2.427307206068268, "grad_norm": 0.9316614866256714, "learning_rate": 0.00017749291627215224, "loss": 0.6842, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1920 }, { "epoch": 2.4399494310998735, "grad_norm": 0.8595056533813477, "learning_rate": 0.0001772455357097927, "loss": 0.7084, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1930 }, { "epoch": 2.452591656131479, "grad_norm": 0.834000825881958, "learning_rate": 0.00017699706661747125, "loss": 0.6951, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1940 }, { "epoch": 2.4652338811630847, "grad_norm": 0.7746726274490356, "learning_rate": 0.0001767475147198781, "loss": 0.7076, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1950 }, { "epoch": 2.47787610619469, "grad_norm": 5.648841857910156, "learning_rate": 0.00017649688576665094, "loss": 0.6874, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1960 }, { "epoch": 2.490518331226296, "grad_norm": 0.8709747195243835, "learning_rate": 0.00017624518553224295, "loss": 0.7033, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1970 }, { "epoch": 2.5031605562579013, "grad_norm": 1.2027637958526611, "learning_rate": 0.00017599241981578904, "loss": 0.6945, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1980 }, { "epoch": 2.5158027812895067, "grad_norm": 0.866089403629303, "learning_rate": 0.00017573859444097308, "loss": 0.6611, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 1990 }, { "epoch": 2.5284450063211126, "grad_norm": 2.6877481937408447, "learning_rate": 0.00017548371525589302, "loss": 0.6922, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2000 }, { "epoch": 2.541087231352718, "grad_norm": 1.8271033763885498, "learning_rate": 0.0001752277881329266, "loss": 0.7011, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2010 }, { "epoch": 2.553729456384324, "grad_norm": 3.121169328689575, "learning_rate": 0.0001749708189685958, "loss": 0.7012, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2020 }, { "epoch": 2.566371681415929, "grad_norm": 1.5094399452209473, "learning_rate": 0.00017471281368343114, "loss": 0.6682, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2030 }, { "epoch": 2.5790139064475346, "grad_norm": 1.1823444366455078, "learning_rate": 0.00017445377822183518, "loss": 0.6828, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2040 }, { "epoch": 2.5916561314791404, "grad_norm": 2.187333106994629, "learning_rate": 0.00017419371855194551, "loss": 0.65, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2050 }, { "epoch": 2.604298356510746, "grad_norm": 1.0692399740219116, "learning_rate": 0.00017393264066549753, "loss": 0.6652, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2060 }, { "epoch": 2.6169405815423517, "grad_norm": 0.8324422240257263, "learning_rate": 0.00017367055057768588, "loss": 0.6999, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2070 }, { "epoch": 2.629582806573957, "grad_norm": 0.9880168437957764, "learning_rate": 0.00017340745432702654, "loss": 0.6859, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2080 }, { "epoch": 2.6422250316055624, "grad_norm": 2.551191568374634, "learning_rate": 0.00017314335797521705, "loss": 0.6948, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2090 }, { "epoch": 2.6548672566371683, "grad_norm": 0.9405047297477722, "learning_rate": 0.0001728782676069972, "loss": 0.6906, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2100 }, { "epoch": 2.6675094816687737, "grad_norm": 5.015996932983398, "learning_rate": 0.00017261218933000878, "loss": 0.6867, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2110 }, { "epoch": 2.6801517067003795, "grad_norm": 0.932569682598114, "learning_rate": 0.00017234512927465488, "loss": 0.7304, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2120 }, { "epoch": 2.692793931731985, "grad_norm": 1.071932315826416, "learning_rate": 0.0001720770935939586, "loss": 0.7261, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2130 }, { "epoch": 2.7054361567635903, "grad_norm": 0.8238343596458435, "learning_rate": 0.00017180808846342118, "loss": 0.7313, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2140 }, { "epoch": 2.718078381795196, "grad_norm": 1.3495972156524658, "learning_rate": 0.0001715381200808801, "loss": 0.7418, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2150 }, { "epoch": 2.7307206068268015, "grad_norm": 0.8959026336669922, "learning_rate": 0.00017126719466636572, "loss": 0.6729, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2160 }, { "epoch": 2.7433628318584073, "grad_norm": 0.8978679180145264, "learning_rate": 0.0001709953184619585, "loss": 0.7, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2170 }, { "epoch": 2.7560050568900127, "grad_norm": 1.033858060836792, "learning_rate": 0.00017072249773164485, "loss": 0.7142, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2180 }, { "epoch": 2.768647281921618, "grad_norm": 0.9381289482116699, "learning_rate": 0.0001704487387611729, "loss": 0.7362, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2190 }, { "epoch": 2.7812895069532235, "grad_norm": 1.0184166431427002, "learning_rate": 0.00017017404785790773, "loss": 0.7133, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2200 }, { "epoch": 2.7939317319848294, "grad_norm": 0.9085473418235779, "learning_rate": 0.00016989843135068605, "loss": 0.6982, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2210 }, { "epoch": 2.8065739570164348, "grad_norm": 0.8378614783287048, "learning_rate": 0.00016962189558967022, "loss": 0.6794, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2220 }, { "epoch": 2.8192161820480406, "grad_norm": 0.9050717949867249, "learning_rate": 0.00016934444694620217, "loss": 0.6967, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2230 }, { "epoch": 2.831858407079646, "grad_norm": 0.8742629289627075, "learning_rate": 0.00016906609181265654, "loss": 0.679, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2240 }, { "epoch": 2.8445006321112514, "grad_norm": 1.250222086906433, "learning_rate": 0.0001687868366022932, "loss": 0.6866, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2250 }, { "epoch": 2.857142857142857, "grad_norm": 0.7830986380577087, "learning_rate": 0.0001685066877491098, "loss": 0.7064, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2260 }, { "epoch": 2.8697850821744626, "grad_norm": 0.837334394454956, "learning_rate": 0.0001682256517076933, "loss": 0.676, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2270 }, { "epoch": 2.8824273072060684, "grad_norm": 1.7227693796157837, "learning_rate": 0.00016794373495307148, "loss": 0.6901, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2280 }, { "epoch": 2.895069532237674, "grad_norm": 0.7620822191238403, "learning_rate": 0.00016766094398056337, "loss": 0.687, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2290 }, { "epoch": 2.907711757269279, "grad_norm": 0.8214982748031616, "learning_rate": 0.00016737728530563013, "loss": 0.7061, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2300 }, { "epoch": 2.920353982300885, "grad_norm": 0.9066684246063232, "learning_rate": 0.00016709276546372448, "loss": 0.7271, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2310 }, { "epoch": 2.9329962073324904, "grad_norm": 0.9356798529624939, "learning_rate": 0.00016680739101014024, "loss": 0.6965, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2320 }, { "epoch": 2.9456384323640963, "grad_norm": 0.8414567112922668, "learning_rate": 0.0001665211685198616, "loss": 0.6829, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2330 }, { "epoch": 2.9582806573957017, "grad_norm": 0.9581737518310547, "learning_rate": 0.0001662341045874111, "loss": 0.6781, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2340 }, { "epoch": 2.970922882427307, "grad_norm": 0.7672229409217834, "learning_rate": 0.0001659462058266982, "loss": 0.7107, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2350 }, { "epoch": 2.983565107458913, "grad_norm": 0.8876848816871643, "learning_rate": 0.0001656574788708665, "loss": 0.682, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2360 }, { "epoch": 2.9962073324905183, "grad_norm": 0.7291796207427979, "learning_rate": 0.00016536793037214134, "loss": 0.7012, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2370 }, { "epoch": 3.0088495575221237, "grad_norm": 1.178667426109314, "learning_rate": 0.00016507756700167588, "loss": 0.5861, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2380 }, { "epoch": 3.0214917825537295, "grad_norm": 1.044280767440796, "learning_rate": 0.00016478639544939826, "loss": 0.5248, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2390 }, { "epoch": 3.034134007585335, "grad_norm": 0.8499409556388855, "learning_rate": 0.00016449442242385672, "loss": 0.5314, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2400 }, { "epoch": 3.0467762326169407, "grad_norm": 0.8145996332168579, "learning_rate": 0.00016420165465206535, "loss": 0.5681, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2410 }, { "epoch": 3.059418457648546, "grad_norm": 0.8090763688087463, "learning_rate": 0.00016390809887934914, "loss": 0.4982, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2420 }, { "epoch": 3.0720606826801515, "grad_norm": 0.7884716391563416, "learning_rate": 0.00016361376186918846, "loss": 0.5338, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2430 }, { "epoch": 3.0847029077117574, "grad_norm": 1.035247564315796, "learning_rate": 0.00016331865040306335, "loss": 0.521, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2440 }, { "epoch": 3.0973451327433628, "grad_norm": 1.029201865196228, "learning_rate": 0.00016302277128029706, "loss": 0.5391, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2450 }, { "epoch": 3.1099873577749686, "grad_norm": 0.8100953102111816, "learning_rate": 0.00016272613131789964, "loss": 0.5141, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2460 }, { "epoch": 3.122629582806574, "grad_norm": 1.0345860719680786, "learning_rate": 0.0001624287373504107, "loss": 0.5576, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2470 }, { "epoch": 3.1352718078381794, "grad_norm": 0.9381860494613647, "learning_rate": 0.00016213059622974214, "loss": 0.5373, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2480 }, { "epoch": 3.147914032869785, "grad_norm": 0.8504341244697571, "learning_rate": 0.00016183171482502003, "loss": 0.5312, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2490 }, { "epoch": 3.1605562579013906, "grad_norm": 1.0047380924224854, "learning_rate": 0.00016153210002242644, "loss": 0.5515, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2500 }, { "epoch": 3.173198482932996, "grad_norm": 0.8505437970161438, "learning_rate": 0.00016123175872504098, "loss": 0.5257, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2510 }, { "epoch": 3.185840707964602, "grad_norm": 1.0271879434585571, "learning_rate": 0.00016093069785268137, "loss": 0.5785, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2520 }, { "epoch": 3.1984829329962072, "grad_norm": 1.0047165155410767, "learning_rate": 0.00016062892434174443, "loss": 0.5373, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2530 }, { "epoch": 3.211125158027813, "grad_norm": 0.9564666152000427, "learning_rate": 0.00016032644514504604, "loss": 0.5285, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2540 }, { "epoch": 3.2237673830594185, "grad_norm": 0.959581732749939, "learning_rate": 0.00016002326723166084, "loss": 0.5813, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2550 }, { "epoch": 3.236409608091024, "grad_norm": 1.3242567777633667, "learning_rate": 0.00015971939758676186, "loss": 0.5669, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2560 }, { "epoch": 3.2490518331226297, "grad_norm": 0.9959767460823059, "learning_rate": 0.00015941484321145953, "loss": 0.5766, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2570 }, { "epoch": 3.261694058154235, "grad_norm": 0.8573315739631653, "learning_rate": 0.0001591096111226405, "loss": 0.5421, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2580 }, { "epoch": 3.274336283185841, "grad_norm": 0.8555790781974792, "learning_rate": 0.00015880370835280553, "loss": 0.5606, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2590 }, { "epoch": 3.2869785082174463, "grad_norm": 1.0024107694625854, "learning_rate": 0.00015849714194990803, "loss": 0.5406, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2600 }, { "epoch": 3.2996207332490517, "grad_norm": 0.867758572101593, "learning_rate": 0.00015818991897719134, "loss": 0.5825, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2610 }, { "epoch": 3.3122629582806575, "grad_norm": 0.828178346157074, "learning_rate": 0.00015788204651302602, "loss": 0.5528, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2620 }, { "epoch": 3.324905183312263, "grad_norm": 0.9778569936752319, "learning_rate": 0.00015757353165074685, "loss": 0.5857, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2630 }, { "epoch": 3.3375474083438688, "grad_norm": 0.9606329798698425, "learning_rate": 0.0001572643814984894, "loss": 0.6056, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2640 }, { "epoch": 3.350189633375474, "grad_norm": 0.8577843308448792, "learning_rate": 0.00015695460317902615, "loss": 0.6096, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2650 }, { "epoch": 3.3628318584070795, "grad_norm": 0.8798738718032837, "learning_rate": 0.00015664420382960256, "loss": 0.5979, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2660 }, { "epoch": 3.3754740834386854, "grad_norm": 0.887492835521698, "learning_rate": 0.00015633319060177233, "loss": 0.5962, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2670 }, { "epoch": 3.3881163084702908, "grad_norm": 0.8709145784378052, "learning_rate": 0.00015602157066123311, "loss": 0.5647, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2680 }, { "epoch": 3.400758533501896, "grad_norm": 0.8413789868354797, "learning_rate": 0.00015570935118766087, "loss": 0.5846, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2690 }, { "epoch": 3.413400758533502, "grad_norm": 0.9737523198127747, "learning_rate": 0.00015539653937454487, "loss": 0.5963, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2700 }, { "epoch": 3.4260429835651074, "grad_norm": 1.0053389072418213, "learning_rate": 0.00015508314242902173, "loss": 0.599, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2710 }, { "epoch": 3.438685208596713, "grad_norm": 0.9921556115150452, "learning_rate": 0.00015476916757170943, "loss": 0.5698, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2720 }, { "epoch": 3.4513274336283186, "grad_norm": 0.9468759298324585, "learning_rate": 0.00015445462203654098, "loss": 0.5886, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2730 }, { "epoch": 3.463969658659924, "grad_norm": 0.9463483095169067, "learning_rate": 0.0001541395130705977, "loss": 0.5829, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2740 }, { "epoch": 3.47661188369153, "grad_norm": 0.9554671049118042, "learning_rate": 0.00015382384793394223, "loss": 0.6186, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2750 }, { "epoch": 3.4892541087231352, "grad_norm": 0.7925019860267639, "learning_rate": 0.0001535076338994514, "loss": 0.5796, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2760 }, { "epoch": 3.5018963337547406, "grad_norm": 0.92326819896698, "learning_rate": 0.00015319087825264846, "loss": 0.5647, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2770 }, { "epoch": 3.5145385587863465, "grad_norm": 0.9871057868003845, "learning_rate": 0.0001528735882915354, "loss": 0.5622, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2780 }, { "epoch": 3.527180783817952, "grad_norm": 0.9997586607933044, "learning_rate": 0.00015255577132642468, "loss": 0.629, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2790 }, { "epoch": 3.5398230088495577, "grad_norm": 0.8749852180480957, "learning_rate": 0.00015223743467977088, "loss": 0.5883, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2800 }, { "epoch": 3.552465233881163, "grad_norm": 0.8085633516311646, "learning_rate": 0.00015191858568600194, "loss": 0.5713, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2810 }, { "epoch": 3.5651074589127685, "grad_norm": 0.951021134853363, "learning_rate": 0.00015159923169135025, "loss": 0.5965, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2820 }, { "epoch": 3.5777496839443743, "grad_norm": 0.9590179324150085, "learning_rate": 0.00015127938005368323, "loss": 0.5678, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2830 }, { "epoch": 3.5903919089759797, "grad_norm": 0.9921982884407043, "learning_rate": 0.0001509590381423341, "loss": 0.6115, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2840 }, { "epoch": 3.6030341340075855, "grad_norm": 0.8661071062088013, "learning_rate": 0.00015063821333793172, "loss": 0.6495, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2850 }, { "epoch": 3.615676359039191, "grad_norm": 0.8504185080528259, "learning_rate": 0.00015031691303223088, "loss": 0.5922, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2860 }, { "epoch": 3.6283185840707963, "grad_norm": 0.8301743865013123, "learning_rate": 0.00014999514462794175, "loss": 0.6227, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2870 }, { "epoch": 3.640960809102402, "grad_norm": 0.8586485385894775, "learning_rate": 0.0001496729155385595, "loss": 0.5801, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2880 }, { "epoch": 3.6536030341340076, "grad_norm": 0.8772161602973938, "learning_rate": 0.00014935023318819334, "loss": 0.5712, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2890 }, { "epoch": 3.6662452591656134, "grad_norm": 0.8610823750495911, "learning_rate": 0.00014902710501139556, "loss": 0.6007, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2900 }, { "epoch": 3.678887484197219, "grad_norm": 0.8283450603485107, "learning_rate": 0.0001487035384529903, "loss": 0.5757, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2910 }, { "epoch": 3.691529709228824, "grad_norm": 0.9658201336860657, "learning_rate": 0.00014837954096790182, "loss": 0.5899, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2920 }, { "epoch": 3.7041719342604296, "grad_norm": 0.8245115280151367, "learning_rate": 0.000148055120020983, "loss": 0.6165, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2930 }, { "epoch": 3.7168141592920354, "grad_norm": 0.9648094177246094, "learning_rate": 0.00014773028308684308, "loss": 0.6212, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2940 }, { "epoch": 3.7294563843236412, "grad_norm": 0.8854801654815674, "learning_rate": 0.00014740503764967572, "loss": 0.5777, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2950 }, { "epoch": 3.7420986093552466, "grad_norm": 0.8945504426956177, "learning_rate": 0.0001470793912030863, "loss": 0.6091, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2960 }, { "epoch": 3.754740834386852, "grad_norm": 0.8189816474914551, "learning_rate": 0.00014675335124991946, "loss": 0.6035, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2970 }, { "epoch": 3.7673830594184574, "grad_norm": 0.990737795829773, "learning_rate": 0.0001464269253020862, "loss": 0.5983, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2980 }, { "epoch": 3.7800252844500632, "grad_norm": 0.8247061371803284, "learning_rate": 0.00014610012088039077, "loss": 0.6056, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 2990 }, { "epoch": 3.7926675094816686, "grad_norm": 0.8422549962997437, "learning_rate": 0.00014577294551435728, "loss": 0.6077, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3000 }, { "epoch": 3.8053097345132745, "grad_norm": 0.9468559622764587, "learning_rate": 0.00014544540674205647, "loss": 0.592, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3010 }, { "epoch": 3.81795195954488, "grad_norm": 0.8015314340591431, "learning_rate": 0.0001451175121099319, "loss": 0.5701, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3020 }, { "epoch": 3.8305941845764853, "grad_norm": 0.896016001701355, "learning_rate": 0.00014478926917262607, "loss": 0.5985, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3030 }, { "epoch": 3.843236409608091, "grad_norm": 0.965329110622406, "learning_rate": 0.00014446068549280633, "loss": 0.5693, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3040 }, { "epoch": 3.8558786346396965, "grad_norm": 1.032674789428711, "learning_rate": 0.0001441317686409907, "loss": 0.6207, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3050 }, { "epoch": 3.8685208596713023, "grad_norm": 1.4689821004867554, "learning_rate": 0.00014380252619537355, "loss": 0.6192, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3060 }, { "epoch": 3.8811630847029077, "grad_norm": 0.9344895482063293, "learning_rate": 0.00014347296574165067, "loss": 0.5951, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3070 }, { "epoch": 3.893805309734513, "grad_norm": 0.9095802903175354, "learning_rate": 0.00014314309487284486, "loss": 0.609, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3080 }, { "epoch": 3.906447534766119, "grad_norm": 0.9843701720237732, "learning_rate": 0.00014281292118913084, "loss": 0.6107, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3090 }, { "epoch": 3.9190897597977243, "grad_norm": 0.8768340349197388, "learning_rate": 0.00014248245229766005, "loss": 0.6268, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3100 }, { "epoch": 3.93173198482933, "grad_norm": 0.9411798715591431, "learning_rate": 0.00014215169581238558, "loss": 0.6191, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3110 }, { "epoch": 3.9443742098609356, "grad_norm": 0.8732224106788635, "learning_rate": 0.0001418206593538865, "loss": 0.614, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3120 }, { "epoch": 3.957016434892541, "grad_norm": 0.8646383285522461, "learning_rate": 0.00014148935054919258, "loss": 0.6135, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3130 }, { "epoch": 3.969658659924147, "grad_norm": 0.8871061205863953, "learning_rate": 0.00014115777703160824, "loss": 0.5987, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3140 }, { "epoch": 3.982300884955752, "grad_norm": 0.7898637652397156, "learning_rate": 0.00014082594644053702, "loss": 0.6069, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3150 }, { "epoch": 3.994943109987358, "grad_norm": 0.8474721908569336, "learning_rate": 0.00014049386642130522, "loss": 0.5762, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3160 }, { "epoch": 4.007585335018963, "grad_norm": 1.1347519159317017, "learning_rate": 0.0001401615446249861, "loss": 0.4878, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3170 }, { "epoch": 4.020227560050569, "grad_norm": 1.0003758668899536, "learning_rate": 0.00013982898870822322, "loss": 0.4266, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3180 }, { "epoch": 4.032869785082174, "grad_norm": 1.7353389263153076, "learning_rate": 0.00013949620633305445, "loss": 0.4278, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3190 }, { "epoch": 4.04551201011378, "grad_norm": 0.8438617587089539, "learning_rate": 0.00013916320516673512, "loss": 0.4255, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3200 }, { "epoch": 4.058154235145386, "grad_norm": 0.9081391096115112, "learning_rate": 0.00013882999288156145, "loss": 0.4332, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3210 }, { "epoch": 4.070796460176991, "grad_norm": 0.8712509274482727, "learning_rate": 0.00013849657715469385, "loss": 0.4263, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3220 }, { "epoch": 4.083438685208597, "grad_norm": 0.8926701545715332, "learning_rate": 0.00013816296566798006, "loss": 0.4265, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3230 }, { "epoch": 4.096080910240202, "grad_norm": 1.0100903511047363, "learning_rate": 0.00013782916610777793, "loss": 0.4601, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3240 }, { "epoch": 4.108723135271807, "grad_norm": 0.9108811616897583, "learning_rate": 0.00013749518616477867, "loss": 0.4426, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3250 }, { "epoch": 4.121365360303414, "grad_norm": 1.0556674003601074, "learning_rate": 0.00013716103353382937, "loss": 0.4641, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3260 }, { "epoch": 4.134007585335019, "grad_norm": 0.8797064423561096, "learning_rate": 0.0001368267159137559, "loss": 0.4522, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3270 }, { "epoch": 4.1466498103666245, "grad_norm": 0.9286285042762756, "learning_rate": 0.0001364922410071853, "loss": 0.4684, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3280 }, { "epoch": 4.15929203539823, "grad_norm": 0.9558693170547485, "learning_rate": 0.00013615761652036872, "loss": 0.4597, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3290 }, { "epoch": 4.171934260429835, "grad_norm": 0.8957265615463257, "learning_rate": 0.00013582285016300338, "loss": 0.5033, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3300 }, { "epoch": 4.184576485461442, "grad_norm": 0.8720874786376953, "learning_rate": 0.00013548794964805531, "loss": 0.4636, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3310 }, { "epoch": 4.197218710493047, "grad_norm": 0.9207468628883362, "learning_rate": 0.0001351529226915815, "loss": 0.4555, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3320 }, { "epoch": 4.209860935524652, "grad_norm": 0.8886120319366455, "learning_rate": 0.000134817777012552, "loss": 0.4391, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3330 }, { "epoch": 4.222503160556258, "grad_norm": 0.9986599087715149, "learning_rate": 0.00013448252033267246, "loss": 0.4848, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3340 }, { "epoch": 4.235145385587863, "grad_norm": 3.081392288208008, "learning_rate": 0.0001341471603762057, "loss": 0.5096, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3350 }, { "epoch": 4.247787610619469, "grad_norm": 1.0110422372817993, "learning_rate": 0.00013381170486979427, "loss": 0.4758, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3360 }, { "epoch": 4.260429835651075, "grad_norm": 0.9332578182220459, "learning_rate": 0.00013347616154228193, "loss": 0.4607, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3370 }, { "epoch": 4.27307206068268, "grad_norm": 1.1386651992797852, "learning_rate": 0.00013314053812453605, "loss": 0.4882, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3380 }, { "epoch": 4.285714285714286, "grad_norm": 0.8812234401702881, "learning_rate": 0.0001328048423492691, "loss": 0.454, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3390 }, { "epoch": 4.298356510745891, "grad_norm": 0.9429104328155518, "learning_rate": 0.00013246908195086072, "loss": 0.4724, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3400 }, { "epoch": 4.310998735777497, "grad_norm": 0.9410486817359924, "learning_rate": 0.0001321332646651795, "loss": 0.4516, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3410 }, { "epoch": 4.323640960809103, "grad_norm": 0.9896162748336792, "learning_rate": 0.00013179739822940454, "loss": 0.4949, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3420 }, { "epoch": 4.336283185840708, "grad_norm": 0.9165130853652954, "learning_rate": 0.00013146149038184768, "loss": 0.487, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3430 }, { "epoch": 4.348925410872313, "grad_norm": 2.110687494277954, "learning_rate": 0.00013112554886177447, "loss": 0.5062, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3440 }, { "epoch": 4.361567635903919, "grad_norm": 0.8859379887580872, "learning_rate": 0.0001307895814092266, "loss": 0.4587, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3450 }, { "epoch": 4.374209860935524, "grad_norm": 1.0231775045394897, "learning_rate": 0.00013045359576484305, "loss": 0.5083, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3460 }, { "epoch": 4.3868520859671305, "grad_norm": 1.0273702144622803, "learning_rate": 0.00013011759966968204, "loss": 0.4849, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3470 }, { "epoch": 4.399494310998736, "grad_norm": 0.9449805617332458, "learning_rate": 0.0001297816008650425, "loss": 0.493, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3480 }, { "epoch": 4.412136536030341, "grad_norm": 0.8178017735481262, "learning_rate": 0.00012944560709228587, "loss": 0.464, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3490 }, { "epoch": 4.424778761061947, "grad_norm": 1.0193867683410645, "learning_rate": 0.00012910962609265754, "loss": 0.4721, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3500 }, { "epoch": 4.437420986093552, "grad_norm": 1.1380479335784912, "learning_rate": 0.00012877366560710868, "loss": 0.4589, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3510 }, { "epoch": 4.450063211125158, "grad_norm": 0.8772681951522827, "learning_rate": 0.00012843773337611788, "loss": 0.4642, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3520 }, { "epoch": 4.462705436156764, "grad_norm": 0.9058607220649719, "learning_rate": 0.00012810183713951264, "loss": 0.5033, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3530 }, { "epoch": 4.475347661188369, "grad_norm": 0.938266932964325, "learning_rate": 0.00012776598463629118, "loss": 0.5098, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3540 }, { "epoch": 4.4879898862199745, "grad_norm": 1.0325732231140137, "learning_rate": 0.00012743018360444422, "loss": 0.4833, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3550 }, { "epoch": 4.50063211125158, "grad_norm": 0.8300301432609558, "learning_rate": 0.0001270944417807763, "loss": 0.4815, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3560 }, { "epoch": 4.513274336283186, "grad_norm": 0.941461443901062, "learning_rate": 0.00012675876690072823, "loss": 0.4942, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3570 }, { "epoch": 4.525916561314792, "grad_norm": 0.8629696369171143, "learning_rate": 0.00012642316669819812, "loss": 0.5091, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3580 }, { "epoch": 4.538558786346397, "grad_norm": 0.9793810844421387, "learning_rate": 0.0001260876489053636, "loss": 0.52, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3590 }, { "epoch": 4.551201011378002, "grad_norm": 0.9196791052818298, "learning_rate": 0.00012575222125250365, "loss": 0.4884, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3600 }, { "epoch": 4.563843236409608, "grad_norm": 1.0433666706085205, "learning_rate": 0.00012541689146782048, "loss": 0.5041, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3610 }, { "epoch": 4.576485461441214, "grad_norm": 1.0952868461608887, "learning_rate": 0.00012508166727726128, "loss": 0.5117, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3620 }, { "epoch": 4.589127686472819, "grad_norm": 1.039157748222351, "learning_rate": 0.00012474655640434042, "loss": 0.5028, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3630 }, { "epoch": 4.601769911504425, "grad_norm": 1.044838786125183, "learning_rate": 0.00012441156656996155, "loss": 0.4941, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3640 }, { "epoch": 4.61441213653603, "grad_norm": 1.0558874607086182, "learning_rate": 0.00012407670549223953, "loss": 0.516, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3650 }, { "epoch": 4.627054361567636, "grad_norm": 0.9311762452125549, "learning_rate": 0.0001237419808863227, "loss": 0.4933, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3660 }, { "epoch": 4.639696586599241, "grad_norm": 1.0576010942459106, "learning_rate": 0.00012340740046421506, "loss": 0.5119, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3670 }, { "epoch": 4.652338811630847, "grad_norm": 0.9502875208854675, "learning_rate": 0.0001230729719345987, "loss": 0.4875, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3680 }, { "epoch": 4.664981036662453, "grad_norm": 0.9513876438140869, "learning_rate": 0.00012273870300265612, "loss": 0.4836, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3690 }, { "epoch": 4.677623261694058, "grad_norm": 1.0516324043273926, "learning_rate": 0.00012240460136989274, "loss": 0.5168, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3700 }, { "epoch": 4.6902654867256635, "grad_norm": 0.9066925644874573, "learning_rate": 0.00012207067473395935, "loss": 0.4947, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3710 }, { "epoch": 4.70290771175727, "grad_norm": 0.9543781876564026, "learning_rate": 0.00012173693078847487, "loss": 0.5155, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3720 }, { "epoch": 4.715549936788875, "grad_norm": 0.9955562949180603, "learning_rate": 0.00012140337722284914, "loss": 0.5302, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3730 }, { "epoch": 4.7281921618204805, "grad_norm": 4.362971305847168, "learning_rate": 0.00012107002172210559, "loss": 0.5438, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3740 }, { "epoch": 4.740834386852086, "grad_norm": 1.0576658248901367, "learning_rate": 0.00012073687196670429, "loss": 0.536, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3750 }, { "epoch": 4.753476611883691, "grad_norm": 0.946419894695282, "learning_rate": 0.00012040393563236494, "loss": 0.5253, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3760 }, { "epoch": 4.766118836915297, "grad_norm": 0.9340927004814148, "learning_rate": 0.00012007122038989012, "loss": 0.5117, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3770 }, { "epoch": 4.778761061946903, "grad_norm": 0.9391945600509644, "learning_rate": 0.00011973873390498841, "loss": 0.5132, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3780 }, { "epoch": 4.791403286978508, "grad_norm": 0.9951459169387817, "learning_rate": 0.00011940648383809794, "loss": 0.5356, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3790 }, { "epoch": 4.804045512010114, "grad_norm": 1.0087045431137085, "learning_rate": 0.00011907447784420974, "loss": 0.4949, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3800 }, { "epoch": 4.816687737041719, "grad_norm": 1.0418733358383179, "learning_rate": 0.00011874272357269138, "loss": 0.5044, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3810 }, { "epoch": 4.8293299620733245, "grad_norm": 0.9647939801216125, "learning_rate": 0.0001184112286671109, "loss": 0.519, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3820 }, { "epoch": 4.841972187104931, "grad_norm": 0.9896367788314819, "learning_rate": 0.00011808000076506056, "loss": 0.5376, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3830 }, { "epoch": 4.854614412136536, "grad_norm": 1.1160699129104614, "learning_rate": 0.00011774904749798086, "loss": 0.4941, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3840 }, { "epoch": 4.867256637168142, "grad_norm": 0.9226526021957397, "learning_rate": 0.00011741837649098477, "loss": 0.5044, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3850 }, { "epoch": 4.879898862199747, "grad_norm": 0.959432065486908, "learning_rate": 0.00011708799536268202, "loss": 0.5051, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3860 }, { "epoch": 4.892541087231352, "grad_norm": 0.8908069729804993, "learning_rate": 0.0001167579117250036, "loss": 0.5226, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3870 }, { "epoch": 4.905183312262958, "grad_norm": 0.8914538025856018, "learning_rate": 0.00011642813318302639, "loss": 0.4971, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3880 }, { "epoch": 4.917825537294564, "grad_norm": 0.940838098526001, "learning_rate": 0.00011609866733479784, "loss": 0.5349, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3890 }, { "epoch": 4.9304677623261695, "grad_norm": 0.9459583759307861, "learning_rate": 0.00011576952177116095, "loss": 0.5137, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3900 }, { "epoch": 4.943109987357775, "grad_norm": 0.988993227481842, "learning_rate": 0.00011544070407557961, "loss": 0.5061, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3910 }, { "epoch": 4.95575221238938, "grad_norm": 0.8528466820716858, "learning_rate": 0.00011511222182396349, "loss": 0.4997, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3920 }, { "epoch": 4.9683944374209865, "grad_norm": 0.9346151351928711, "learning_rate": 0.00011478408258449373, "loss": 0.5347, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3930 }, { "epoch": 4.981036662452592, "grad_norm": 0.9937970638275146, "learning_rate": 0.00011445629391744854, "loss": 0.5138, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3940 }, { "epoch": 4.993678887484197, "grad_norm": 1.021466612815857, "learning_rate": 0.00011412886337502894, "loss": 0.4953, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3950 }, { "epoch": 5.006321112515803, "grad_norm": 0.8485009074211121, "learning_rate": 0.00011380179850118495, "loss": 0.4504, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3960 }, { "epoch": 5.018963337547408, "grad_norm": 0.9451215267181396, "learning_rate": 0.00011347510683144151, "loss": 0.3505, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3970 }, { "epoch": 5.0316055625790135, "grad_norm": 0.9910890460014343, "learning_rate": 0.00011314879589272505, "loss": 0.3889, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3980 }, { "epoch": 5.04424778761062, "grad_norm": 1.070092797279358, "learning_rate": 0.00011282287320318996, "loss": 0.3514, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 3990 }, { "epoch": 5.056890012642225, "grad_norm": 0.9985383749008179, "learning_rate": 0.0001124973462720455, "loss": 0.3563, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4000 }, { "epoch": 5.0695322376738305, "grad_norm": 0.8897594213485718, "learning_rate": 0.00011217222259938272, "loss": 0.3402, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4010 }, { "epoch": 5.082174462705436, "grad_norm": 0.981590211391449, "learning_rate": 0.00011184750967600157, "loss": 0.4163, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4020 }, { "epoch": 5.094816687737041, "grad_norm": 0.8742545247077942, "learning_rate": 0.00011152321498323846, "loss": 0.3477, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4030 }, { "epoch": 5.107458912768648, "grad_norm": 0.9774489402770996, "learning_rate": 0.0001111993459927938, "loss": 0.3722, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4040 }, { "epoch": 5.120101137800253, "grad_norm": 0.9024301171302795, "learning_rate": 0.00011087591016656001, "loss": 0.3531, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4050 }, { "epoch": 5.132743362831858, "grad_norm": 0.9952253103256226, "learning_rate": 0.00011055291495644926, "loss": 0.3762, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4060 }, { "epoch": 5.145385587863464, "grad_norm": 0.9904897809028625, "learning_rate": 0.00011023036780422212, "loss": 0.4032, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4070 }, { "epoch": 5.158027812895069, "grad_norm": 0.9370035529136658, "learning_rate": 0.00010990827614131594, "loss": 0.3717, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4080 }, { "epoch": 5.1706700379266755, "grad_norm": 1.055816650390625, "learning_rate": 0.00010958664738867372, "loss": 0.3958, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4090 }, { "epoch": 5.183312262958281, "grad_norm": 1.0066580772399902, "learning_rate": 0.00010926548895657303, "loss": 0.3793, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4100 }, { "epoch": 5.195954487989886, "grad_norm": 1.0231560468673706, "learning_rate": 0.00010894480824445532, "loss": 0.3813, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4110 }, { "epoch": 5.208596713021492, "grad_norm": 0.9747928977012634, "learning_rate": 0.00010862461264075542, "loss": 0.3594, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4120 }, { "epoch": 5.221238938053097, "grad_norm": 1.0806195735931396, "learning_rate": 0.00010830490952273145, "loss": 0.3956, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4130 }, { "epoch": 5.233881163084703, "grad_norm": 1.0321904420852661, "learning_rate": 0.00010798570625629461, "loss": 0.3585, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4140 }, { "epoch": 5.246523388116309, "grad_norm": 1.2540595531463623, "learning_rate": 0.00010766701019583967, "loss": 0.391, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4150 }, { "epoch": 5.259165613147914, "grad_norm": 1.035423994064331, "learning_rate": 0.00010734882868407537, "loss": 0.4028, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4160 }, { "epoch": 5.2718078381795195, "grad_norm": 1.2022385597229004, "learning_rate": 0.00010703116905185541, "loss": 0.3841, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4170 }, { "epoch": 5.284450063211125, "grad_norm": 1.045843482017517, "learning_rate": 0.00010671403861800946, "loss": 0.3939, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4180 }, { "epoch": 5.29709228824273, "grad_norm": 0.9559326767921448, "learning_rate": 0.00010639744468917447, "loss": 0.3801, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4190 }, { "epoch": 5.3097345132743365, "grad_norm": 1.033033847808838, "learning_rate": 0.0001060813945596265, "loss": 0.3846, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4200 }, { "epoch": 5.322376738305942, "grad_norm": 0.9737249612808228, "learning_rate": 0.00010576589551111242, "loss": 0.39, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4210 }, { "epoch": 5.335018963337547, "grad_norm": 1.0500105619430542, "learning_rate": 0.00010545095481268241, "loss": 0.3713, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4220 }, { "epoch": 5.347661188369153, "grad_norm": 1.1261670589447021, "learning_rate": 0.00010513657972052228, "loss": 0.4112, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4230 }, { "epoch": 5.360303413400759, "grad_norm": 0.9046671390533447, "learning_rate": 0.0001048227774777864, "loss": 0.3963, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4240 }, { "epoch": 5.372945638432364, "grad_norm": 1.0187987089157104, "learning_rate": 0.00010450955531443067, "loss": 0.3954, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4250 }, { "epoch": 5.38558786346397, "grad_norm": 0.9995326399803162, "learning_rate": 0.00010419692044704624, "loss": 0.3996, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4260 }, { "epoch": 5.398230088495575, "grad_norm": 0.9701279997825623, "learning_rate": 0.00010388488007869282, "loss": 0.3805, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4270 }, { "epoch": 5.410872313527181, "grad_norm": 0.9126356840133667, "learning_rate": 0.00010357344139873315, "loss": 0.3862, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4280 }, { "epoch": 5.423514538558786, "grad_norm": 0.9048483371734619, "learning_rate": 0.00010326261158266701, "loss": 0.3767, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4290 }, { "epoch": 5.436156763590392, "grad_norm": 0.9570040702819824, "learning_rate": 0.0001029523977919662, "loss": 0.3875, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4300 }, { "epoch": 5.448798988621998, "grad_norm": 1.0698267221450806, "learning_rate": 0.00010264280717390927, "loss": 0.4159, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4310 }, { "epoch": 5.461441213653603, "grad_norm": 1.03220796585083, "learning_rate": 0.00010233384686141701, "loss": 0.4062, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4320 }, { "epoch": 5.474083438685208, "grad_norm": 0.9866275787353516, "learning_rate": 0.00010202552397288805, "loss": 0.4064, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4330 }, { "epoch": 5.486725663716814, "grad_norm": 0.9090940356254578, "learning_rate": 0.00010171784561203485, "loss": 0.4178, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4340 }, { "epoch": 5.49936788874842, "grad_norm": 1.0094218254089355, "learning_rate": 0.00010141081886772013, "loss": 0.4046, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4350 }, { "epoch": 5.5120101137800255, "grad_norm": 0.9741319417953491, "learning_rate": 0.00010110445081379343, "loss": 0.3957, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4360 }, { "epoch": 5.524652338811631, "grad_norm": 1.186471700668335, "learning_rate": 0.00010079874850892808, "loss": 0.4112, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4370 }, { "epoch": 5.537294563843236, "grad_norm": 1.0046883821487427, "learning_rate": 0.00010049371899645874, "loss": 0.3976, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4380 }, { "epoch": 5.549936788874842, "grad_norm": 2.301224946975708, "learning_rate": 0.00010018936930421907, "loss": 0.4381, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4390 }, { "epoch": 5.562579013906447, "grad_norm": 1.1555812358856201, "learning_rate": 9.988570644437969e-05, "loss": 0.4139, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4400 }, { "epoch": 5.575221238938053, "grad_norm": 1.0925663709640503, "learning_rate": 9.958273741328672e-05, "loss": 0.376, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4410 }, { "epoch": 5.587863463969659, "grad_norm": 1.0395334959030151, "learning_rate": 9.928046919130056e-05, "loss": 0.4696, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4420 }, { "epoch": 5.600505689001264, "grad_norm": 1.0506666898727417, "learning_rate": 9.897890874263518e-05, "loss": 0.4165, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4430 }, { "epoch": 5.6131479140328695, "grad_norm": 0.9786500930786133, "learning_rate": 9.867806301519742e-05, "loss": 0.3949, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4440 }, { "epoch": 5.625790139064476, "grad_norm": 1.0455806255340576, "learning_rate": 9.837793894042716e-05, "loss": 0.3976, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4450 }, { "epoch": 5.638432364096081, "grad_norm": 0.9991239905357361, "learning_rate": 9.807854343313739e-05, "loss": 0.3862, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4460 }, { "epoch": 5.651074589127687, "grad_norm": 1.0253965854644775, "learning_rate": 9.777988339135517e-05, "loss": 0.3859, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4470 }, { "epoch": 5.663716814159292, "grad_norm": 0.9867163300514221, "learning_rate": 9.748196569616245e-05, "loss": 0.401, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4480 }, { "epoch": 5.676359039190897, "grad_norm": 0.9973002672195435, "learning_rate": 9.718479721153764e-05, "loss": 0.4055, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4490 }, { "epoch": 5.689001264222503, "grad_norm": 1.03886079788208, "learning_rate": 9.688838478419746e-05, "loss": 0.4031, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4500 }, { "epoch": 5.701643489254109, "grad_norm": 1.1662676334381104, "learning_rate": 9.659273524343917e-05, "loss": 0.3998, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4510 }, { "epoch": 5.714285714285714, "grad_norm": 0.9785062670707703, "learning_rate": 9.629785540098329e-05, "loss": 0.3925, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4520 }, { "epoch": 5.72692793931732, "grad_norm": 1.0249117612838745, "learning_rate": 9.600375205081654e-05, "loss": 0.4195, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4530 }, { "epoch": 5.739570164348925, "grad_norm": 1.0373821258544922, "learning_rate": 9.571043196903541e-05, "loss": 0.4197, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4540 }, { "epoch": 5.752212389380531, "grad_norm": 0.9370099306106567, "learning_rate": 9.541790191368998e-05, "loss": 0.39, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4550 }, { "epoch": 5.764854614412137, "grad_norm": 1.0252115726470947, "learning_rate": 9.512616862462831e-05, "loss": 0.408, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4560 }, { "epoch": 5.777496839443742, "grad_norm": 1.033614158630371, "learning_rate": 9.483523882334102e-05, "loss": 0.4194, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4570 }, { "epoch": 5.790139064475348, "grad_norm": 1.1127879619598389, "learning_rate": 9.454511921280651e-05, "loss": 0.4098, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4580 }, { "epoch": 5.802781289506953, "grad_norm": 0.9151955246925354, "learning_rate": 9.425581647733652e-05, "loss": 0.4202, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4590 }, { "epoch": 5.815423514538558, "grad_norm": 1.0775083303451538, "learning_rate": 9.396733728242207e-05, "loss": 0.4181, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4600 }, { "epoch": 5.828065739570165, "grad_norm": 0.9415781497955322, "learning_rate": 9.367968827458003e-05, "loss": 0.4538, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4610 }, { "epoch": 5.84070796460177, "grad_norm": 0.9953785538673401, "learning_rate": 9.339287608119976e-05, "loss": 0.4121, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4620 }, { "epoch": 5.8533501896333755, "grad_norm": 1.0544629096984863, "learning_rate": 9.310690731039065e-05, "loss": 0.4025, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4630 }, { "epoch": 5.865992414664981, "grad_norm": 0.9968181848526001, "learning_rate": 9.282178855082963e-05, "loss": 0.4179, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4640 }, { "epoch": 5.878634639696586, "grad_norm": 0.9884583353996277, "learning_rate": 9.253752637160965e-05, "loss": 0.4345, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4650 }, { "epoch": 5.891276864728193, "grad_norm": 1.0549771785736084, "learning_rate": 9.225412732208815e-05, "loss": 0.4171, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4660 }, { "epoch": 5.903919089759798, "grad_norm": 1.058349847793579, "learning_rate": 9.19715979317361e-05, "loss": 0.3954, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4670 }, { "epoch": 5.916561314791403, "grad_norm": 0.959523618221283, "learning_rate": 9.168994470998771e-05, "loss": 0.4078, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4680 }, { "epoch": 5.929203539823009, "grad_norm": 1.0451573133468628, "learning_rate": 9.140917414609043e-05, "loss": 0.4477, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4690 }, { "epoch": 5.941845764854614, "grad_norm": 1.0435268878936768, "learning_rate": 9.112929270895536e-05, "loss": 0.3955, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4700 }, { "epoch": 5.9544879898862195, "grad_norm": 1.001197338104248, "learning_rate": 9.085030684700828e-05, "loss": 0.4086, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4710 }, { "epoch": 5.967130214917826, "grad_norm": 1.0496070384979248, "learning_rate": 9.057222298804104e-05, "loss": 0.4342, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4720 }, { "epoch": 5.979772439949431, "grad_norm": 0.955414891242981, "learning_rate": 9.029504753906348e-05, "loss": 0.4041, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4730 }, { "epoch": 5.992414664981037, "grad_norm": 3.551063060760498, "learning_rate": 9.001878688615582e-05, "loss": 0.4304, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4740 }, { "epoch": 6.005056890012642, "grad_norm": 0.8560709953308105, "learning_rate": 8.974344739432153e-05, "loss": 0.3485, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4750 }, { "epoch": 6.017699115044247, "grad_norm": 1.048302412033081, "learning_rate": 8.946903540734064e-05, "loss": 0.2697, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4760 }, { "epoch": 6.030341340075854, "grad_norm": 1.1160005331039429, "learning_rate": 8.919555724762359e-05, "loss": 0.2732, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4770 }, { "epoch": 6.042983565107459, "grad_norm": 0.9840885400772095, "learning_rate": 8.892301921606567e-05, "loss": 0.2855, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4780 }, { "epoch": 6.055625790139064, "grad_norm": 0.9168655872344971, "learning_rate": 8.865142759190168e-05, "loss": 0.2657, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4790 }, { "epoch": 6.06826801517067, "grad_norm": 0.9473972916603088, "learning_rate": 8.838078863256136e-05, "loss": 0.2808, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4800 }, { "epoch": 6.080910240202275, "grad_norm": 1.079185962677002, "learning_rate": 8.811110857352518e-05, "loss": 0.2815, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4810 }, { "epoch": 6.0935524652338815, "grad_norm": 1.0252193212509155, "learning_rate": 8.784239362818074e-05, "loss": 0.2981, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4820 }, { "epoch": 6.106194690265487, "grad_norm": 0.9863188862800598, "learning_rate": 8.757464998767951e-05, "loss": 0.2817, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4830 }, { "epoch": 6.118836915297092, "grad_norm": 0.9947652220726013, "learning_rate": 8.730788382079432e-05, "loss": 0.2946, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4840 }, { "epoch": 6.131479140328698, "grad_norm": 1.0014346837997437, "learning_rate": 8.704210127377708e-05, "loss": 0.2902, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4850 }, { "epoch": 6.144121365360303, "grad_norm": 1.0144473314285278, "learning_rate": 8.677730847021724e-05, "loss": 0.2828, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4860 }, { "epoch": 6.156763590391909, "grad_norm": 1.0776128768920898, "learning_rate": 8.651351151090082e-05, "loss": 0.306, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4870 }, { "epoch": 6.169405815423515, "grad_norm": 1.13133704662323, "learning_rate": 8.625071647366963e-05, "loss": 0.2842, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4880 }, { "epoch": 6.18204804045512, "grad_norm": 1.0843030214309692, "learning_rate": 8.598892941328137e-05, "loss": 0.2938, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4890 }, { "epoch": 6.1946902654867255, "grad_norm": 1.0806282758712769, "learning_rate": 8.572815636127013e-05, "loss": 0.3009, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4900 }, { "epoch": 6.207332490518331, "grad_norm": 1.2078369855880737, "learning_rate": 8.54684033258074e-05, "loss": 0.3298, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4910 }, { "epoch": 6.219974715549937, "grad_norm": 1.0101124048233032, "learning_rate": 8.520967629156365e-05, "loss": 0.2938, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4920 }, { "epoch": 6.232616940581543, "grad_norm": 1.0761367082595825, "learning_rate": 8.495198121957043e-05, "loss": 0.3062, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4930 }, { "epoch": 6.245259165613148, "grad_norm": 1.1186556816101074, "learning_rate": 8.469532404708298e-05, "loss": 0.3024, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4940 }, { "epoch": 6.257901390644753, "grad_norm": 1.0951234102249146, "learning_rate": 8.443971068744362e-05, "loss": 0.2902, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4950 }, { "epoch": 6.270543615676359, "grad_norm": 0.9902530908584595, "learning_rate": 8.418514702994525e-05, "loss": 0.296, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4960 }, { "epoch": 6.283185840707965, "grad_norm": 1.1143983602523804, "learning_rate": 8.393163893969586e-05, "loss": 0.3114, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4970 }, { "epoch": 6.29582806573957, "grad_norm": 1.0336135625839233, "learning_rate": 8.367919225748333e-05, "loss": 0.3308, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4980 }, { "epoch": 6.308470290771176, "grad_norm": 0.9870953559875488, "learning_rate": 8.34278127996408e-05, "loss": 0.2956, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 4990 }, { "epoch": 6.321112515802781, "grad_norm": 1.0120779275894165, "learning_rate": 8.317750635791284e-05, "loss": 0.313, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5000 }, { "epoch": 6.333754740834387, "grad_norm": 0.9608586430549622, "learning_rate": 8.292827869932179e-05, "loss": 0.3005, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5010 }, { "epoch": 6.346396965865992, "grad_norm": 1.1399952173233032, "learning_rate": 8.268013556603504e-05, "loss": 0.302, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5020 }, { "epoch": 6.359039190897598, "grad_norm": 1.1939678192138672, "learning_rate": 8.243308267523261e-05, "loss": 0.3214, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5030 }, { "epoch": 6.371681415929204, "grad_norm": 1.0933220386505127, "learning_rate": 8.218712571897564e-05, "loss": 0.3145, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5040 }, { "epoch": 6.384323640960809, "grad_norm": 1.6601200103759766, "learning_rate": 8.194227036407498e-05, "loss": 0.3069, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5050 }, { "epoch": 6.3969658659924145, "grad_norm": 1.105997920036316, "learning_rate": 8.169852225196077e-05, "loss": 0.2998, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5060 }, { "epoch": 6.40960809102402, "grad_norm": 1.0879909992218018, "learning_rate": 8.145588699855247e-05, "loss": 0.3087, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5070 }, { "epoch": 6.422250316055626, "grad_norm": 1.011335015296936, "learning_rate": 8.121437019412947e-05, "loss": 0.2982, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5080 }, { "epoch": 6.4348925410872315, "grad_norm": 1.2018229961395264, "learning_rate": 8.09739774032022e-05, "loss": 0.3272, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5090 }, { "epoch": 6.447534766118837, "grad_norm": 1.0991839170455933, "learning_rate": 8.073471416438405e-05, "loss": 0.3434, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5100 }, { "epoch": 6.460176991150442, "grad_norm": 1.228576898574829, "learning_rate": 8.049658599026369e-05, "loss": 0.3113, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5110 }, { "epoch": 6.472819216182048, "grad_norm": 1.0694067478179932, "learning_rate": 8.0259598367278e-05, "loss": 0.3114, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5120 }, { "epoch": 6.485461441213654, "grad_norm": 1.0272830724716187, "learning_rate": 8.002375675558586e-05, "loss": 0.3103, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5130 }, { "epoch": 6.498103666245259, "grad_norm": 0.974769115447998, "learning_rate": 7.978906658894213e-05, "loss": 0.3093, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5140 }, { "epoch": 6.510745891276865, "grad_norm": 1.1441291570663452, "learning_rate": 7.955553327457256e-05, "loss": 0.3317, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5150 }, { "epoch": 6.52338811630847, "grad_norm": 1.0339381694793701, "learning_rate": 7.932316219304925e-05, "loss": 0.2997, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5160 }, { "epoch": 6.5360303413400755, "grad_norm": 1.0404632091522217, "learning_rate": 7.90919586981666e-05, "loss": 0.3095, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5170 }, { "epoch": 6.548672566371682, "grad_norm": 1.1902042627334595, "learning_rate": 7.886192811681793e-05, "loss": 0.2978, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5180 }, { "epoch": 6.561314791403287, "grad_norm": 1.089690089225769, "learning_rate": 7.863307574887296e-05, "loss": 0.3103, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5190 }, { "epoch": 6.573957016434893, "grad_norm": 1.1589289903640747, "learning_rate": 7.840540686705539e-05, "loss": 0.3425, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5200 }, { "epoch": 6.586599241466498, "grad_norm": 1.0016796588897705, "learning_rate": 7.817892671682173e-05, "loss": 0.3004, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5210 }, { "epoch": 6.599241466498103, "grad_norm": 1.1263011693954468, "learning_rate": 7.795364051624015e-05, "loss": 0.3124, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5220 }, { "epoch": 6.611883691529709, "grad_norm": 1.1125059127807617, "learning_rate": 7.77295534558705e-05, "loss": 0.331, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5230 }, { "epoch": 6.624525916561315, "grad_norm": 1.1294969320297241, "learning_rate": 7.750667069864458e-05, "loss": 0.308, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5240 }, { "epoch": 6.6371681415929205, "grad_norm": 1.0179051160812378, "learning_rate": 7.728499737974723e-05, "loss": 0.3057, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5250 }, { "epoch": 6.649810366624526, "grad_norm": 1.0046980381011963, "learning_rate": 7.706453860649807e-05, "loss": 0.2959, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5260 }, { "epoch": 6.662452591656131, "grad_norm": 1.110780954360962, "learning_rate": 7.684529945823368e-05, "loss": 0.3461, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5270 }, { "epoch": 6.6750948166877375, "grad_norm": 1.0861669778823853, "learning_rate": 7.662728498619076e-05, "loss": 0.2993, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5280 }, { "epoch": 6.687737041719343, "grad_norm": 1.0867419242858887, "learning_rate": 7.641050021338954e-05, "loss": 0.3354, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5290 }, { "epoch": 6.700379266750948, "grad_norm": 1.1115156412124634, "learning_rate": 7.619495013451831e-05, "loss": 0.3177, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5300 }, { "epoch": 6.713021491782554, "grad_norm": 1.0660215616226196, "learning_rate": 7.59806397158181e-05, "loss": 0.3141, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5310 }, { "epoch": 6.725663716814159, "grad_norm": 0.9811689257621765, "learning_rate": 7.576757389496838e-05, "loss": 0.3354, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5320 }, { "epoch": 6.7383059418457645, "grad_norm": 1.0768461227416992, "learning_rate": 7.555575758097325e-05, "loss": 0.3108, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5330 }, { "epoch": 6.750948166877371, "grad_norm": 1.1170628070831299, "learning_rate": 7.534519565404843e-05, "loss": 0.3206, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5340 }, { "epoch": 6.763590391908976, "grad_norm": 0.9863327145576477, "learning_rate": 7.51358929655087e-05, "loss": 0.2973, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5350 }, { "epoch": 6.7762326169405815, "grad_norm": 1.1545705795288086, "learning_rate": 7.492785433765617e-05, "loss": 0.3393, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5360 }, { "epoch": 6.788874841972187, "grad_norm": 1.0578138828277588, "learning_rate": 7.472108456366925e-05, "loss": 0.323, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5370 }, { "epoch": 6.801517067003792, "grad_norm": 1.0187878608703613, "learning_rate": 7.451558840749207e-05, "loss": 0.3386, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5380 }, { "epoch": 6.814159292035399, "grad_norm": 1.0566604137420654, "learning_rate": 7.431137060372486e-05, "loss": 0.3161, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5390 }, { "epoch": 6.826801517067004, "grad_norm": 0.9965440034866333, "learning_rate": 7.410843585751477e-05, "loss": 0.322, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5400 }, { "epoch": 6.839443742098609, "grad_norm": 1.1252332925796509, "learning_rate": 7.390678884444751e-05, "loss": 0.3421, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5410 }, { "epoch": 6.852085967130215, "grad_norm": 1.5158134698867798, "learning_rate": 7.370643421043957e-05, "loss": 0.3375, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5420 }, { "epoch": 6.86472819216182, "grad_norm": 1.0333036184310913, "learning_rate": 7.350737657163133e-05, "loss": 0.3173, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5430 }, { "epoch": 6.877370417193426, "grad_norm": 1.0372684001922607, "learning_rate": 7.33096205142805e-05, "loss": 0.3362, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5440 }, { "epoch": 6.890012642225032, "grad_norm": 0.9757832288742065, "learning_rate": 7.311317059465658e-05, "loss": 0.3255, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5450 }, { "epoch": 6.902654867256637, "grad_norm": 1.0241106748580933, "learning_rate": 7.291803133893588e-05, "loss": 0.3146, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5460 }, { "epoch": 6.915297092288243, "grad_norm": 1.095625638961792, "learning_rate": 7.272420724309719e-05, "loss": 0.3185, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5470 }, { "epoch": 6.927939317319848, "grad_norm": 1.1619679927825928, "learning_rate": 7.25317027728182e-05, "loss": 0.3149, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5480 }, { "epoch": 6.940581542351454, "grad_norm": 1.090199589729309, "learning_rate": 7.234052236337267e-05, "loss": 0.3194, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5490 }, { "epoch": 6.95322376738306, "grad_norm": 1.0253422260284424, "learning_rate": 7.215067041952817e-05, "loss": 0.3748, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5500 }, { "epoch": 6.965865992414665, "grad_norm": 0.9982818365097046, "learning_rate": 7.196215131544458e-05, "loss": 0.3315, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5510 }, { "epoch": 6.9785082174462705, "grad_norm": 1.0616456270217896, "learning_rate": 7.177496939457349e-05, "loss": 0.3197, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5520 }, { "epoch": 6.991150442477876, "grad_norm": 1.0430032014846802, "learning_rate": 7.158912896955785e-05, "loss": 0.332, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5530 }, { "epoch": 7.003792667509481, "grad_norm": 0.9125473499298096, "learning_rate": 7.140463432213281e-05, "loss": 0.2938, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5540 }, { "epoch": 7.0164348925410875, "grad_norm": 1.0734045505523682, "learning_rate": 7.122148970302702e-05, "loss": 0.2281, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5550 }, { "epoch": 7.029077117572693, "grad_norm": 1.0642218589782715, "learning_rate": 7.103969933186467e-05, "loss": 0.2096, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5560 }, { "epoch": 7.041719342604298, "grad_norm": 1.080702304840088, "learning_rate": 7.085926739706828e-05, "loss": 0.2014, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5570 }, { "epoch": 7.054361567635904, "grad_norm": 1.0507287979125977, "learning_rate": 7.06801980557622e-05, "loss": 0.2107, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5580 }, { "epoch": 7.067003792667509, "grad_norm": 1.0190140008926392, "learning_rate": 7.050249543367683e-05, "loss": 0.2106, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5590 }, { "epoch": 7.079646017699115, "grad_norm": 1.1010105609893799, "learning_rate": 7.032616362505359e-05, "loss": 0.2142, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5600 }, { "epoch": 7.092288242730721, "grad_norm": 0.9539241194725037, "learning_rate": 7.015120669255053e-05, "loss": 0.2138, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5610 }, { "epoch": 7.104930467762326, "grad_norm": 1.2524183988571167, "learning_rate": 6.99776286671488e-05, "loss": 0.2166, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5620 }, { "epoch": 7.117572692793932, "grad_norm": 1.0015676021575928, "learning_rate": 6.980543354805969e-05, "loss": 0.2075, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5630 }, { "epoch": 7.130214917825537, "grad_norm": 1.0855770111083984, "learning_rate": 6.963462530263261e-05, "loss": 0.2322, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5640 }, { "epoch": 7.142857142857143, "grad_norm": 1.1854267120361328, "learning_rate": 6.946520786626358e-05, "loss": 0.2192, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5650 }, { "epoch": 7.155499367888749, "grad_norm": 1.1590447425842285, "learning_rate": 6.929718514230455e-05, "loss": 0.2286, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5660 }, { "epoch": 7.168141592920354, "grad_norm": 1.0713489055633545, "learning_rate": 6.913056100197355e-05, "loss": 0.2101, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5670 }, { "epoch": 7.180783817951959, "grad_norm": 1.0067224502563477, "learning_rate": 6.896533928426545e-05, "loss": 0.2191, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5680 }, { "epoch": 7.193426042983565, "grad_norm": 1.0778611898422241, "learning_rate": 6.880152379586353e-05, "loss": 0.2242, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5690 }, { "epoch": 7.206068268015171, "grad_norm": 1.1107529401779175, "learning_rate": 6.863911831105174e-05, "loss": 0.236, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5700 }, { "epoch": 7.2187104930467765, "grad_norm": 1.1352819204330444, "learning_rate": 6.847812657162774e-05, "loss": 0.2306, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5710 }, { "epoch": 7.231352718078382, "grad_norm": 1.1808239221572876, "learning_rate": 6.831855228681676e-05, "loss": 0.2313, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5720 }, { "epoch": 7.243994943109987, "grad_norm": 1.161959171295166, "learning_rate": 6.816039913318605e-05, "loss": 0.2365, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5730 }, { "epoch": 7.256637168141593, "grad_norm": 1.102596402168274, "learning_rate": 6.800367075456027e-05, "loss": 0.2247, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5740 }, { "epoch": 7.269279393173198, "grad_norm": 0.9597683548927307, "learning_rate": 6.78483707619374e-05, "loss": 0.216, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5750 }, { "epoch": 7.281921618204804, "grad_norm": 1.1682482957839966, "learning_rate": 6.769450273340572e-05, "loss": 0.2481, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5760 }, { "epoch": 7.29456384323641, "grad_norm": 1.043906807899475, "learning_rate": 6.754207021406114e-05, "loss": 0.2284, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5770 }, { "epoch": 7.307206068268015, "grad_norm": 1.110894799232483, "learning_rate": 6.73910767159258e-05, "loss": 0.2605, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5780 }, { "epoch": 7.3198482932996205, "grad_norm": 1.06911039352417, "learning_rate": 6.724152571786693e-05, "loss": 0.2263, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5790 }, { "epoch": 7.332490518331226, "grad_norm": 1.144773006439209, "learning_rate": 6.709342066551677e-05, "loss": 0.2363, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5800 }, { "epoch": 7.345132743362832, "grad_norm": 3.6639792919158936, "learning_rate": 6.694676497119325e-05, "loss": 0.249, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5810 }, { "epoch": 7.357774968394438, "grad_norm": 0.9481773376464844, "learning_rate": 6.680156201382128e-05, "loss": 0.2531, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5820 }, { "epoch": 7.370417193426043, "grad_norm": 1.118088960647583, "learning_rate": 6.66578151388549e-05, "loss": 0.2158, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5830 }, { "epoch": 7.383059418457648, "grad_norm": 1.0164135694503784, "learning_rate": 6.651552765820028e-05, "loss": 0.256, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5840 }, { "epoch": 7.395701643489254, "grad_norm": 1.046364188194275, "learning_rate": 6.637470285013933e-05, "loss": 0.2344, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5850 }, { "epoch": 7.40834386852086, "grad_norm": 1.0682607889175415, "learning_rate": 6.623534395925426e-05, "loss": 0.2189, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5860 }, { "epoch": 7.420986093552465, "grad_norm": 1.1149200201034546, "learning_rate": 6.609745419635272e-05, "loss": 0.2313, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5870 }, { "epoch": 7.433628318584071, "grad_norm": 1.2037601470947266, "learning_rate": 6.596103673839385e-05, "loss": 0.239, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5880 }, { "epoch": 7.446270543615676, "grad_norm": 1.2147172689437866, "learning_rate": 6.582609472841519e-05, "loss": 0.253, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5890 }, { "epoch": 7.458912768647282, "grad_norm": 1.061748743057251, "learning_rate": 6.569263127546012e-05, "loss": 0.2491, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5900 }, { "epoch": 7.471554993678888, "grad_norm": 1.1806966066360474, "learning_rate": 6.556064945450633e-05, "loss": 0.2307, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5910 }, { "epoch": 7.484197218710493, "grad_norm": 1.0720311403274536, "learning_rate": 6.54301523063949e-05, "loss": 0.2567, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5920 }, { "epoch": 7.496839443742099, "grad_norm": 1.1361720561981201, "learning_rate": 6.530114283776029e-05, "loss": 0.221, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5930 }, { "epoch": 7.509481668773704, "grad_norm": 1.7318781614303589, "learning_rate": 6.517362402096104e-05, "loss": 0.2343, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5940 }, { "epoch": 7.522123893805309, "grad_norm": 1.2448699474334717, "learning_rate": 6.504759879401134e-05, "loss": 0.2487, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5950 }, { "epoch": 7.534766118836915, "grad_norm": 1.144116997718811, "learning_rate": 6.492307006051322e-05, "loss": 0.2246, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5960 }, { "epoch": 7.547408343868521, "grad_norm": 1.121053695678711, "learning_rate": 6.480004068958982e-05, "loss": 0.2345, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5970 }, { "epoch": 7.5600505689001265, "grad_norm": 0.9634986519813538, "learning_rate": 6.46785135158191e-05, "loss": 0.2206, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5980 }, { "epoch": 7.572692793931732, "grad_norm": 1.0400941371917725, "learning_rate": 6.455849133916868e-05, "loss": 0.2259, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 5990 }, { "epoch": 7.585335018963337, "grad_norm": 1.1151084899902344, "learning_rate": 6.44399769249313e-05, "loss": 0.2412, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6000 }, { "epoch": 7.597977243994944, "grad_norm": 1.2084640264511108, "learning_rate": 6.432297300366104e-05, "loss": 0.2469, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6010 }, { "epoch": 7.610619469026549, "grad_norm": 1.1408836841583252, "learning_rate": 6.420748227111045e-05, "loss": 0.2276, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6020 }, { "epoch": 7.623261694058154, "grad_norm": 1.132438063621521, "learning_rate": 6.409350738816844e-05, "loss": 0.2476, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6030 }, { "epoch": 7.63590391908976, "grad_norm": 1.0751878023147583, "learning_rate": 6.398105098079903e-05, "loss": 0.2527, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6040 }, { "epoch": 7.648546144121365, "grad_norm": 1.1522191762924194, "learning_rate": 6.387011563998073e-05, "loss": 0.2596, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6050 }, { "epoch": 7.6611883691529705, "grad_norm": 1.0497066974639893, "learning_rate": 6.376070392164694e-05, "loss": 0.2534, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6060 }, { "epoch": 7.673830594184577, "grad_norm": 2.2555932998657227, "learning_rate": 6.3652818346627e-05, "loss": 0.2413, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6070 }, { "epoch": 7.686472819216182, "grad_norm": 0.9901424646377563, "learning_rate": 6.354646140058816e-05, "loss": 0.2442, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6080 }, { "epoch": 7.699115044247788, "grad_norm": 1.066794753074646, "learning_rate": 6.344163553397834e-05, "loss": 0.2428, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6090 }, { "epoch": 7.711757269279393, "grad_norm": 1.0979070663452148, "learning_rate": 6.333834316196953e-05, "loss": 0.2457, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6100 }, { "epoch": 7.724399494310998, "grad_norm": 1.1070395708084106, "learning_rate": 6.323658666440228e-05, "loss": 0.25, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6110 }, { "epoch": 7.737041719342605, "grad_norm": 1.0736275911331177, "learning_rate": 6.313636838573086e-05, "loss": 0.2524, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6120 }, { "epoch": 7.74968394437421, "grad_norm": 1.217236042022705, "learning_rate": 6.303769063496915e-05, "loss": 0.2707, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6130 }, { "epoch": 7.762326169405815, "grad_norm": 1.180005669593811, "learning_rate": 6.294055568563754e-05, "loss": 0.2405, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6140 }, { "epoch": 7.774968394437421, "grad_norm": 1.116621971130371, "learning_rate": 6.28449657757105e-05, "loss": 0.2469, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6150 }, { "epoch": 7.787610619469026, "grad_norm": 1.0715476274490356, "learning_rate": 6.2750923107565e-05, "loss": 0.2482, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6160 }, { "epoch": 7.8002528445006325, "grad_norm": 1.0930267572402954, "learning_rate": 6.265842984792986e-05, "loss": 0.2872, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6170 }, { "epoch": 7.812895069532238, "grad_norm": 1.232857346534729, "learning_rate": 6.25674881278357e-05, "loss": 0.2536, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6180 }, { "epoch": 7.825537294563843, "grad_norm": 1.1025636196136475, "learning_rate": 6.247810004256595e-05, "loss": 0.2513, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6190 }, { "epoch": 7.838179519595449, "grad_norm": 2.9798877239227295, "learning_rate": 6.23902676516085e-05, "loss": 0.2668, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6200 }, { "epoch": 7.850821744627054, "grad_norm": 1.3299516439437866, "learning_rate": 6.230399297860826e-05, "loss": 0.2637, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6210 }, { "epoch": 7.86346396965866, "grad_norm": 1.1211531162261963, "learning_rate": 6.221927801132061e-05, "loss": 0.2385, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6220 }, { "epoch": 7.876106194690266, "grad_norm": 1.2004640102386475, "learning_rate": 6.213612470156552e-05, "loss": 0.2594, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6230 }, { "epoch": 7.888748419721871, "grad_norm": 1.0749276876449585, "learning_rate": 6.205453496518261e-05, "loss": 0.2551, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6240 }, { "epoch": 7.9013906447534765, "grad_norm": 1.2336843013763428, "learning_rate": 6.197451068198699e-05, "loss": 0.284, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6250 }, { "epoch": 7.914032869785082, "grad_norm": 1.194594383239746, "learning_rate": 6.189605369572598e-05, "loss": 0.2442, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6260 }, { "epoch": 7.926675094816687, "grad_norm": 1.0185377597808838, "learning_rate": 6.181916581403667e-05, "loss": 0.2523, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6270 }, { "epoch": 7.939317319848294, "grad_norm": 1.0479494333267212, "learning_rate": 6.174384880840409e-05, "loss": 0.2545, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6280 }, { "epoch": 7.951959544879899, "grad_norm": 1.0949984788894653, "learning_rate": 6.167010441412064e-05, "loss": 0.2513, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6290 }, { "epoch": 7.964601769911504, "grad_norm": 1.1074668169021606, "learning_rate": 6.159793433024597e-05, "loss": 0.2601, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6300 }, { "epoch": 7.97724399494311, "grad_norm": 1.2110705375671387, "learning_rate": 6.152734021956782e-05, "loss": 0.2685, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6310 }, { "epoch": 7.989886219974716, "grad_norm": 1.0655533075332642, "learning_rate": 6.145832370856379e-05, "loss": 0.2444, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6320 }, { "epoch": 8.002528445006321, "grad_norm": 0.8317849040031433, "learning_rate": 6.139088638736378e-05, "loss": 0.2416, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6330 }, { "epoch": 8.015170670037927, "grad_norm": 1.3935742378234863, "learning_rate": 6.132502980971345e-05, "loss": 0.1735, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6340 }, { "epoch": 8.027812895069532, "grad_norm": 1.0203521251678467, "learning_rate": 6.12607554929383e-05, "loss": 0.1674, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6350 }, { "epoch": 8.040455120101138, "grad_norm": 1.0844451189041138, "learning_rate": 6.119806491790886e-05, "loss": 0.1563, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6360 }, { "epoch": 8.053097345132743, "grad_norm": 1.0661518573760986, "learning_rate": 6.113695952900643e-05, "loss": 0.1579, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6370 }, { "epoch": 8.065739570164348, "grad_norm": 0.9967635869979858, "learning_rate": 6.107744073408987e-05, "loss": 0.1601, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6380 }, { "epoch": 8.078381795195954, "grad_norm": 1.1493229866027832, "learning_rate": 6.10195099044632e-05, "loss": 0.1586, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6390 }, { "epoch": 8.09102402022756, "grad_norm": 3.224154233932495, "learning_rate": 6.096316837484391e-05, "loss": 0.188, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6400 }, { "epoch": 8.103666245259166, "grad_norm": 1.0153775215148926, "learning_rate": 6.090841744333229e-05, "loss": 0.1821, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6410 }, { "epoch": 8.116308470290772, "grad_norm": 1.5251129865646362, "learning_rate": 6.0855258371381465e-05, "loss": 0.195, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6420 }, { "epoch": 8.128950695322377, "grad_norm": 1.1285451650619507, "learning_rate": 6.0803692383768375e-05, "loss": 0.1559, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6430 }, { "epoch": 8.141592920353983, "grad_norm": 1.0399773120880127, "learning_rate": 6.075372066856554e-05, "loss": 0.1609, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6440 }, { "epoch": 8.154235145385588, "grad_norm": 1.1441960334777832, "learning_rate": 6.07053443771137e-05, "loss": 0.1731, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6450 }, { "epoch": 8.166877370417193, "grad_norm": 1.0369312763214111, "learning_rate": 6.065856462399524e-05, "loss": 0.1661, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6460 }, { "epoch": 8.179519595448799, "grad_norm": 1.1654633283615112, "learning_rate": 6.061338248700856e-05, "loss": 0.2005, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6470 }, { "epoch": 8.192161820480404, "grad_norm": 1.0257656574249268, "learning_rate": 6.0569799007143233e-05, "loss": 0.1688, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6480 }, { "epoch": 8.20480404551201, "grad_norm": 1.05653977394104, "learning_rate": 6.052781518855601e-05, "loss": 0.1732, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6490 }, { "epoch": 8.217446270543615, "grad_norm": 0.9420139193534851, "learning_rate": 6.0487431998547705e-05, "loss": 0.1704, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6500 }, { "epoch": 8.230088495575222, "grad_norm": 1.0948173999786377, "learning_rate": 6.044865036754086e-05, "loss": 0.178, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6510 }, { "epoch": 8.242730720606827, "grad_norm": 1.1382850408554077, "learning_rate": 6.0411471189058353e-05, "loss": 0.1945, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6520 }, { "epoch": 8.255372945638433, "grad_norm": 1.1092077493667603, "learning_rate": 6.037589531970283e-05, "loss": 0.1628, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6530 }, { "epoch": 8.268015170670038, "grad_norm": 1.0578278303146362, "learning_rate": 6.0341923579136886e-05, "loss": 0.1815, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6540 }, { "epoch": 8.280657395701644, "grad_norm": 1.170258641242981, "learning_rate": 6.030955675006428e-05, "loss": 0.1633, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6550 }, { "epoch": 8.293299620733249, "grad_norm": 1.1795989274978638, "learning_rate": 6.027879557821183e-05, "loss": 0.1987, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6560 }, { "epoch": 8.305941845764854, "grad_norm": 1.1237478256225586, "learning_rate": 6.0249640772312264e-05, "loss": 0.1878, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6570 }, { "epoch": 8.31858407079646, "grad_norm": 1.2054628133773804, "learning_rate": 6.022209300408786e-05, "loss": 0.1765, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6580 }, { "epoch": 8.331226295828065, "grad_norm": 1.16087806224823, "learning_rate": 6.019615290823503e-05, "loss": 0.1779, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6590 }, { "epoch": 8.34386852085967, "grad_norm": 1.0747262239456177, "learning_rate": 6.017182108240963e-05, "loss": 0.1741, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6600 }, { "epoch": 8.356510745891278, "grad_norm": 1.171136498451233, "learning_rate": 6.014909808721324e-05, "loss": 0.1928, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6610 }, { "epoch": 8.369152970922883, "grad_norm": 1.09550940990448, "learning_rate": 6.0127984446180196e-05, "loss": 0.1745, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6620 }, { "epoch": 8.381795195954489, "grad_norm": 1.184849739074707, "learning_rate": 6.010848064576561e-05, "loss": 0.1889, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6630 }, { "epoch": 8.394437420986094, "grad_norm": 1.1877614259719849, "learning_rate": 6.009058713533404e-05, "loss": 0.1859, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6640 }, { "epoch": 8.4070796460177, "grad_norm": 1.2223458290100098, "learning_rate": 6.007430432714928e-05, "loss": 0.1901, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6650 }, { "epoch": 8.419721871049305, "grad_norm": 1.1974024772644043, "learning_rate": 6.005963259636473e-05, "loss": 0.2126, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6660 }, { "epoch": 8.43236409608091, "grad_norm": 1.180246353149414, "learning_rate": 6.0046572281014854e-05, "loss": 0.1883, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6670 }, { "epoch": 8.445006321112515, "grad_norm": 1.1506062746047974, "learning_rate": 6.003512368200732e-05, "loss": 0.186, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6680 }, { "epoch": 8.45764854614412, "grad_norm": 1.1646850109100342, "learning_rate": 6.002528706311613e-05, "loss": 0.1949, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6690 }, { "epoch": 8.470290771175726, "grad_norm": 1.0622496604919434, "learning_rate": 6.001706265097548e-05, "loss": 0.1958, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6700 }, { "epoch": 8.482932996207332, "grad_norm": 1.1623327732086182, "learning_rate": 6.0010450635074554e-05, "loss": 0.1857, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6710 }, { "epoch": 8.495575221238939, "grad_norm": 1.1403242349624634, "learning_rate": 6.000545116775322e-05, "loss": 0.1894, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6720 }, { "epoch": 8.508217446270544, "grad_norm": 1.1317553520202637, "learning_rate": 6.000206436419843e-05, "loss": 0.1847, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6730 }, { "epoch": 8.52085967130215, "grad_norm": 1.1617845296859741, "learning_rate": 6.000029030244164e-05, "loss": 0.1802, "memory/device_mem_reserved(gib)": 34.32, "memory/max_mem_active(gib)": 33.22, "memory/max_mem_allocated(gib)": 33.22, "step": 6740 } ], "logging_steps": 10, "max_steps": 6745, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.882959280114762e+18, "train_batch_size": 28, "trial_name": null, "trial_params": null }