{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.037392884964944, "eval_steps": 1000, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 775.203125, "epoch": 0.002077382498052454, "grad_norm": 0.16490910947322845, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "reward": 0.2167968787252903, "reward_std": 0.11324757407419384, "rewards/argmax_reward_func": 0.0625, "rewards/format_reward_func": 0.154296875, "step": 1 }, { "completion_length": 820.609375, "epoch": 0.004154764996104908, "grad_norm": 0.15733271837234497, "kl": 0.0, "learning_rate": 2e-05, "loss": 0.0, "reward": 0.1472656298428774, "reward_std": 0.020439805870410055, "rewards/argmax_reward_func": 0.0, "rewards/format_reward_func": 0.14726562798023224, "step": 2 }, { "completion_length": 901.25, "epoch": 0.006232147494157362, "grad_norm": 0.14691142737865448, "kl": 0.0010660013067536056, "learning_rate": 4e-05, "loss": 0.0, "reward": 0.20703125, "reward_std": 0.11269514623563737, "rewards/argmax_reward_func": 0.0625, "rewards/format_reward_func": 0.14453125, "step": 3 }, { "completion_length": 873.015625, "epoch": 0.008309529992209816, "grad_norm": 0.14998185634613037, "kl": 0.0019050340633839369, "learning_rate": 6e-05, "loss": 0.0, "reward": 0.2011718824505806, "reward_std": 0.09004563023336232, "rewards/argmax_reward_func": 0.046875, "rewards/format_reward_func": 0.1542968787252903, "step": 4 }, { "completion_length": 870.546875, "epoch": 0.01038691249026227, "grad_norm": 0.1567591279745102, "kl": 0.005349995743017644, "learning_rate": 8e-05, "loss": 0.0, "reward": 0.2285156361758709, "reward_std": 0.1110378596931696, "rewards/argmax_reward_func": 0.0625, "rewards/format_reward_func": 0.1660156287252903, "step": 5 }, { "completion_length": 849.125, "epoch": 0.012464294988314724, "grad_norm": 0.10938515514135361, "kl": 0.01296996301971376, "learning_rate": 0.0001, "loss": 0.0, "reward": 0.24414063058793545, "reward_std": 0.0999893163680099, "rewards/argmax_reward_func": 0.0625, "rewards/format_reward_func": 0.1816406324505806, "step": 6 }, { "completion_length": 901.015625, "epoch": 0.014541677486367177, "grad_norm": 0.12581659853458405, "kl": 0.02171943092253059, "learning_rate": 9.999973058889791e-05, "loss": 0.0, "reward": 0.2585937548428774, "reward_std": 0.12816310487687588, "rewards/argmax_reward_func": 0.078125, "rewards/format_reward_func": 0.18046875298023224, "step": 7 }, { "completion_length": 916.671875, "epoch": 0.01661905998441963, "grad_norm": 0.12178487330675125, "kl": 0.04081101668998599, "learning_rate": 9.999892235849491e-05, "loss": 0.0, "reward": 0.3437500111758709, "reward_std": 0.1900349531788379, "rewards/argmax_reward_func": 0.15625, "rewards/format_reward_func": 0.1875, "step": 8 }, { "completion_length": 803.90625, "epoch": 0.018696442482472084, "grad_norm": 0.12499672174453735, "kl": 0.06826442573219538, "learning_rate": 9.999757531750085e-05, "loss": 0.0, "reward": 0.45625001564621925, "reward_std": 0.25411650398746133, "rewards/argmax_reward_func": 0.265625, "rewards/format_reward_func": 0.1906250026077032, "step": 9 }, { "completion_length": 953.875, "epoch": 0.02077382498052454, "grad_norm": 0.11061865091323853, "kl": 0.06516677932813764, "learning_rate": 9.999568948043205e-05, "loss": 0.0, "reward": 0.3804687615483999, "reward_std": 0.23091456340625882, "rewards/argmax_reward_func": 0.1875, "rewards/format_reward_func": 0.19296875037252903, "step": 10 }, { "completion_length": 824.546875, "epoch": 0.022851207478576992, "grad_norm": 0.10025237500667572, "kl": 0.10202133795246482, "learning_rate": 9.999326486761114e-05, "loss": 0.0001, "reward": 0.4562500212341547, "reward_std": 0.203293202444911, "rewards/argmax_reward_func": 0.265625, "rewards/format_reward_func": 0.1906250026077032, "step": 11 }, { "completion_length": 925.234375, "epoch": 0.02492858997662945, "grad_norm": 0.12423845380544662, "kl": 0.14641187246888876, "learning_rate": 9.99903015051668e-05, "loss": 0.0001, "reward": 0.6261719018220901, "reward_std": 0.22925727342953905, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.1886718738824129, "step": 12 }, { "completion_length": 810.546875, "epoch": 0.0270059724746819, "grad_norm": 0.1263190507888794, "kl": 0.23557536769658327, "learning_rate": 9.998679942503358e-05, "loss": 0.0001, "reward": 0.5953125320374966, "reward_std": 0.2717941626906395, "rewards/argmax_reward_func": 0.40625, "rewards/format_reward_func": 0.18906250409781933, "step": 13 }, { "completion_length": 738.953125, "epoch": 0.029083354972734354, "grad_norm": 0.09476204961538315, "kl": 0.29587008990347385, "learning_rate": 9.998275866495138e-05, "loss": 0.0001, "reward": 0.7289062887430191, "reward_std": 0.18009126000106335, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19765625335276127, "step": 14 }, { "completion_length": 699.109375, "epoch": 0.03116073747078681, "grad_norm": 0.15413929522037506, "kl": 0.2644388508051634, "learning_rate": 9.997817926846529e-05, "loss": 0.0001, "reward": 0.6968750357627869, "reward_std": 0.4021669775247574, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.19687500223517418, "step": 15 }, { "completion_length": 716.4375, "epoch": 0.03323811996883926, "grad_norm": 0.13675570487976074, "kl": 0.41714945435523987, "learning_rate": 9.99730612849249e-05, "loss": 0.0002, "reward": 0.6320312805473804, "reward_std": 0.27289901627227664, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.19453125074505806, "step": 16 }, { "completion_length": 688.609375, "epoch": 0.03531550246689172, "grad_norm": 0.14246560633182526, "kl": 0.35029047913849354, "learning_rate": 9.996740476948385e-05, "loss": 0.0002, "reward": 0.6304687857627869, "reward_std": 0.31930290907621384, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.19296875223517418, "step": 17 }, { "completion_length": 630.828125, "epoch": 0.03739288496494417, "grad_norm": 2.1836376190185547, "kl": 10.275608837604523, "learning_rate": 9.996120978309931e-05, "loss": 0.0051, "reward": 0.5742187947034836, "reward_std": 0.39885240606963634, "rewards/argmax_reward_func": 0.375, "rewards/format_reward_func": 0.1992187537252903, "step": 18 }, { "completion_length": 647.890625, "epoch": 0.039470267462996624, "grad_norm": 0.1236676499247551, "kl": 0.366399560123682, "learning_rate": 9.995447639253115e-05, "loss": 0.0002, "reward": 0.7765625417232513, "reward_std": 0.2894718423485756, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19843750074505806, "step": 19 }, { "completion_length": 567.578125, "epoch": 0.04154764996104908, "grad_norm": 0.12248539924621582, "kl": 0.28800770081579685, "learning_rate": 9.994720467034142e-05, "loss": 0.0001, "reward": 0.807812537997961, "reward_std": 0.24527766928076744, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.1984375026077032, "step": 20 }, { "completion_length": 538.796875, "epoch": 0.04362503245910153, "grad_norm": 0.14135704934597015, "kl": 0.5838185884058475, "learning_rate": 9.993939469489342e-05, "loss": 0.0003, "reward": 0.6835937947034836, "reward_std": 0.24417280592024326, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19921875186264515, "step": 21 }, { "completion_length": 534.40625, "epoch": 0.045702414957153985, "grad_norm": 0.16315752267837524, "kl": 0.37346063926815987, "learning_rate": 9.993104655035088e-05, "loss": 0.0002, "reward": 0.6835937909781933, "reward_std": 0.37675532698631287, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.1992187537252903, "step": 22 }, { "completion_length": 583.4375, "epoch": 0.04777979745520644, "grad_norm": 0.13479600846767426, "kl": 0.5061899088323116, "learning_rate": 9.992216032667716e-05, "loss": 0.0003, "reward": 0.5878906548023224, "reward_std": 0.2911291141062975, "rewards/argmax_reward_func": 0.390625, "rewards/format_reward_func": 0.19726562686264515, "step": 23 }, { "completion_length": 532.203125, "epoch": 0.0498571799532589, "grad_norm": 0.12497097253799438, "kl": 0.5098075568675995, "learning_rate": 9.991273611963412e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 24 }, { "completion_length": 503.4375, "epoch": 0.051934562451311346, "grad_norm": 0.153394415974617, "kl": 0.35232703387737274, "learning_rate": 9.990277403078122e-05, "loss": 0.0002, "reward": 0.7156250439584255, "reward_std": 0.3314562924206257, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 25 }, { "completion_length": 491.25, "epoch": 0.0540119449493638, "grad_norm": 0.15910868346691132, "kl": 0.41421468555927277, "learning_rate": 9.989227416747434e-05, "loss": 0.0002, "reward": 0.7625000476837158, "reward_std": 0.35355337895452976, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 26 }, { "completion_length": 485.390625, "epoch": 0.05608932744741626, "grad_norm": 34.968101501464844, "kl": 504.1205723620951, "learning_rate": 9.988123664286469e-05, "loss": 0.2521, "reward": 0.6375000439584255, "reward_std": 0.39774755388498306, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.20000000298023224, "step": 27 }, { "completion_length": 543.859375, "epoch": 0.05816670994546871, "grad_norm": 0.1266699880361557, "kl": 0.3936588950455189, "learning_rate": 9.98696615758975e-05, "loss": 0.0002, "reward": 0.7296875417232513, "reward_std": 0.2673747483640909, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.1984375026077032, "step": 28 }, { "completion_length": 520.84375, "epoch": 0.06024409244352116, "grad_norm": 0.1384185552597046, "kl": 0.41820336878299713, "learning_rate": 9.985754909131085e-05, "loss": 0.0002, "reward": 0.6523437947034836, "reward_std": 0.2883669789880514, "rewards/argmax_reward_func": 0.453125, "rewards/format_reward_func": 0.19921875186264515, "step": 29 }, { "completion_length": 566.515625, "epoch": 0.06232147494157362, "grad_norm": 0.12411382049322128, "kl": 0.37708618491888046, "learning_rate": 9.984489931963428e-05, "loss": 0.0002, "reward": 0.7304687909781933, "reward_std": 0.26626989617943764, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.1992187537252903, "step": 30 }, { "completion_length": 557.71875, "epoch": 0.06439885743962608, "grad_norm": 0.1417299211025238, "kl": 0.5239567384123802, "learning_rate": 9.98317123971873e-05, "loss": 0.0003, "reward": 0.5867187865078449, "reward_std": 0.33698057383298874, "rewards/argmax_reward_func": 0.390625, "rewards/format_reward_func": 0.19609375298023224, "step": 31 }, { "completion_length": 571.46875, "epoch": 0.06647623993767852, "grad_norm": 0.12581190466880798, "kl": 0.3905966766178608, "learning_rate": 9.981798846607808e-05, "loss": 0.0002, "reward": 0.8238281756639481, "reward_std": 0.31101649068295956, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19882812909781933, "step": 32 }, { "completion_length": 617.171875, "epoch": 0.06855362243573097, "grad_norm": 0.119756318628788, "kl": 0.37264879420399666, "learning_rate": 9.980372767420177e-05, "loss": 0.0002, "reward": 0.6210937835276127, "reward_std": 0.2883669789880514, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.1992187537252903, "step": 33 }, { "completion_length": 604.265625, "epoch": 0.07063100493378344, "grad_norm": 0.12877410650253296, "kl": 0.4410099685192108, "learning_rate": 9.978893017523903e-05, "loss": 0.0002, "reward": 0.6687500476837158, "reward_std": 0.3535533808171749, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.20000000298023224, "step": 34 }, { "completion_length": 664.875, "epoch": 0.07270838743183589, "grad_norm": 0.10366171598434448, "kl": 0.5219907499849796, "learning_rate": 9.977359612865423e-05, "loss": 0.0003, "reward": 0.726562537252903, "reward_std": 0.26958445459604263, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.1953125037252903, "step": 35 }, { "completion_length": 658.515625, "epoch": 0.07478576992988833, "grad_norm": 0.1170380637049675, "kl": 0.46299856156110764, "learning_rate": 9.97577256996939e-05, "loss": 0.0002, "reward": 0.8226562887430191, "reward_std": 0.3126737759448588, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19765625335276127, "step": 36 }, { "completion_length": 621.890625, "epoch": 0.0768631524279408, "grad_norm": 0.09539435803890228, "kl": 0.39572376012802124, "learning_rate": 9.974131905938483e-05, "loss": 0.0002, "reward": 0.851562537252903, "reward_std": 0.18119611439760774, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.1953125037252903, "step": 37 }, { "completion_length": 616.671875, "epoch": 0.07894053492599325, "grad_norm": 0.1066877692937851, "kl": 0.3970871977508068, "learning_rate": 9.972437638453227e-05, "loss": 0.0002, "reward": 0.5734375342726707, "reward_std": 0.2673747483640909, "rewards/argmax_reward_func": 0.375, "rewards/format_reward_func": 0.1984375026077032, "step": 38 }, { "completion_length": 620.515625, "epoch": 0.0810179174240457, "grad_norm": 0.09872303903102875, "kl": 0.4494887478649616, "learning_rate": 9.970689785771798e-05, "loss": 0.0002, "reward": 0.8539062887430191, "reward_std": 0.22428543493151665, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19765625521540642, "step": 39 }, { "completion_length": 646.25, "epoch": 0.08309529992209816, "grad_norm": 0.10916193574666977, "kl": 0.3997967578470707, "learning_rate": 9.968888366729835e-05, "loss": 0.0002, "reward": 0.7867187820374966, "reward_std": 0.31156892515718937, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19296875223517418, "step": 40 }, { "completion_length": 636.578125, "epoch": 0.08517268242015061, "grad_norm": 0.109690822660923, "kl": 0.4680747017264366, "learning_rate": 9.967033400740227e-05, "loss": 0.0002, "reward": 0.7125000357627869, "reward_std": 0.2916815411299467, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19687500409781933, "step": 41 }, { "completion_length": 633.8125, "epoch": 0.08725006491820306, "grad_norm": 0.11978733539581299, "kl": 0.42677244916558266, "learning_rate": 9.965124907792915e-05, "loss": 0.0002, "reward": 0.6804687902331352, "reward_std": 0.3369805682450533, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19609375298023224, "step": 42 }, { "completion_length": 641.328125, "epoch": 0.08932744741625552, "grad_norm": 0.10323718935251236, "kl": 0.4870793893933296, "learning_rate": 9.963162908454664e-05, "loss": 0.0002, "reward": 0.6820312812924385, "reward_std": 0.2905766926705837, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19765625335276127, "step": 43 }, { "completion_length": 672.90625, "epoch": 0.09140482991430797, "grad_norm": 0.10177203267812729, "kl": 0.4337821826338768, "learning_rate": 9.96114742386885e-05, "loss": 0.0002, "reward": 0.7445312850177288, "reward_std": 0.3347708657383919, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.19765625335276127, "step": 44 }, { "completion_length": 639.203125, "epoch": 0.09348221241236043, "grad_norm": 0.07864588499069214, "kl": 0.6409419141709805, "learning_rate": 9.95907847575523e-05, "loss": 0.0003, "reward": 0.6367187835276127, "reward_std": 0.17788154818117619, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.1992187537252903, "step": 45 }, { "completion_length": 662.859375, "epoch": 0.09555959491041288, "grad_norm": 0.1133044883608818, "kl": 0.49651604518294334, "learning_rate": 9.95695608640971e-05, "loss": 0.0002, "reward": 0.6664062887430191, "reward_std": 0.31267377361655235, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.19765624962747097, "step": 46 }, { "completion_length": 615.1875, "epoch": 0.09763697740846533, "grad_norm": 0.09465198963880539, "kl": 0.42672090977430344, "learning_rate": 9.954780278704097e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 47 }, { "completion_length": 624.234375, "epoch": 0.0997143599065178, "grad_norm": 0.12003368884325027, "kl": 0.45439790561795235, "learning_rate": 9.952551076085864e-05, "loss": 0.0002, "reward": 0.6531250402331352, "reward_std": 0.375650467351079, "rewards/argmax_reward_func": 0.453125, "rewards/format_reward_func": 0.20000000298023224, "step": 48 }, { "completion_length": 657.6875, "epoch": 0.10179174240457024, "grad_norm": 0.10603732615709305, "kl": 0.4523283280432224, "learning_rate": 9.950268502577884e-05, "loss": 0.0002, "reward": 0.823437537997961, "reward_std": 0.3115689232945442, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19843750074505806, "step": 49 }, { "completion_length": 611.640625, "epoch": 0.10386912490262269, "grad_norm": 0.11261381953954697, "kl": 0.5920008532702923, "learning_rate": 9.947932582778188e-05, "loss": 0.0003, "reward": 0.7765625417232513, "reward_std": 0.33366600796580315, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.1984375026077032, "step": 50 }, { "completion_length": 662.359375, "epoch": 0.10594650740067516, "grad_norm": 0.11194069683551788, "kl": 0.40367136895656586, "learning_rate": 9.94554334185968e-05, "loss": 0.0002, "reward": 0.6375000365078449, "reward_std": 0.35355338267982006, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.20000000298023224, "step": 51 }, { "completion_length": 686.265625, "epoch": 0.1080238898987276, "grad_norm": 0.10016939043998718, "kl": 0.429857462644577, "learning_rate": 9.943100805569887e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.33145629055798054, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 52 }, { "completion_length": 613.890625, "epoch": 0.11010127239678005, "grad_norm": 0.11434896290302277, "kl": 0.41631242260336876, "learning_rate": 9.94060500023066e-05, "loss": 0.0002, "reward": 0.620312537997961, "reward_std": 0.333666006103158, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.1984375026077032, "step": 53 }, { "completion_length": 671.75, "epoch": 0.11217865489483252, "grad_norm": 0.0824907198548317, "kl": 0.4247642531991005, "learning_rate": 9.938055952737907e-05, "loss": 0.0002, "reward": 0.7156250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 54 }, { "completion_length": 711.078125, "epoch": 0.11425603739288497, "grad_norm": 0.09910566359758377, "kl": 0.46975456923246384, "learning_rate": 9.935453690561297e-05, "loss": 0.0002, "reward": 0.7906250469386578, "reward_std": 0.3137786239385605, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19687500223517418, "step": 55 }, { "completion_length": 659.5625, "epoch": 0.11633341989093741, "grad_norm": 0.0940733402967453, "kl": 0.42034388333559036, "learning_rate": 9.932798241743961e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 56 }, { "completion_length": 694.796875, "epoch": 0.11841080238898988, "grad_norm": 0.20298048853874207, "kl": 1.01119814068079, "learning_rate": 9.930089634902197e-05, "loss": 0.0005, "reward": 0.714062537997961, "reward_std": 0.28947182931005955, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19843750447034836, "step": 57 }, { "completion_length": 679.421875, "epoch": 0.12048818488704233, "grad_norm": 0.10724397003650665, "kl": 0.45981432124972343, "learning_rate": 9.927327899225151e-05, "loss": 0.0002, "reward": 0.7156250476837158, "reward_std": 0.375650467351079, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 58 }, { "completion_length": 666.390625, "epoch": 0.12256556738509478, "grad_norm": 0.09369952231645584, "kl": 0.5710588954389095, "learning_rate": 9.924513064474519e-05, "loss": 0.0003, "reward": 0.8085937909781933, "reward_std": 0.24417280405759811, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19921875186264515, "step": 59 }, { "completion_length": 667.3125, "epoch": 0.12464294988314724, "grad_norm": 0.10410826653242111, "kl": 0.6697803623974323, "learning_rate": 9.921645160984206e-05, "loss": 0.0003, "reward": 0.7625000476837158, "reward_std": 0.35355337895452976, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 60 }, { "completion_length": 694.453125, "epoch": 0.1267203323811997, "grad_norm": 0.10938889533281326, "kl": 0.42841707170009613, "learning_rate": 9.918724219660013e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.3756504636257887, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 61 }, { "completion_length": 767.796875, "epoch": 0.12879771487925215, "grad_norm": 0.08572812378406525, "kl": 0.42277197539806366, "learning_rate": 9.915750271979305e-05, "loss": 0.0002, "reward": 0.6843750402331352, "reward_std": 0.28726212307810783, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 62 }, { "completion_length": 746.703125, "epoch": 0.1308750973773046, "grad_norm": 0.08672405034303665, "kl": 0.4743144288659096, "learning_rate": 9.91272334999066e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 63 }, { "completion_length": 785.140625, "epoch": 0.13295247987535705, "grad_norm": 0.07892299443483353, "kl": 0.5303685143589973, "learning_rate": 9.909643486313533e-05, "loss": 0.0003, "reward": 0.7312500402331352, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 64 }, { "completion_length": 816.015625, "epoch": 0.1350298623734095, "grad_norm": 0.071454256772995, "kl": 0.39743437245488167, "learning_rate": 9.906510714137905e-05, "loss": 0.0002, "reward": 0.6218750402331352, "reward_std": 0.24306794628500938, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.20000000298023224, "step": 65 }, { "completion_length": 836.4375, "epoch": 0.13710724487146195, "grad_norm": 0.08313830941915512, "kl": 0.3903077654540539, "learning_rate": 9.903325067223919e-05, "loss": 0.0002, "reward": 0.6367187909781933, "reward_std": 0.31046406738460064, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.1992187537252903, "step": 66 }, { "completion_length": 787.484375, "epoch": 0.13918462736951442, "grad_norm": 0.08504212647676468, "kl": 0.5619952343404293, "learning_rate": 9.90008657990152e-05, "loss": 0.0003, "reward": 0.7464844211935997, "reward_std": 0.28781455010175705, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.19960937649011612, "step": 67 }, { "completion_length": 807.921875, "epoch": 0.14126200986756687, "grad_norm": 0.08398205786943436, "kl": 0.47908810153603554, "learning_rate": 9.896795287070086e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.331456296145916, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 68 }, { "completion_length": 837.234375, "epoch": 0.14333939236561932, "grad_norm": 0.054244451224803925, "kl": 0.39820099994540215, "learning_rate": 9.893451224198052e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 69 }, { "completion_length": 910.0625, "epoch": 0.14541677486367177, "grad_norm": 0.08078251034021378, "kl": 0.4756108485162258, "learning_rate": 9.890054427322521e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.331456296145916, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 70 }, { "completion_length": 867.84375, "epoch": 0.14749415736172422, "grad_norm": 0.08043571561574936, "kl": 0.3970469869673252, "learning_rate": 9.886604933048888e-05, "loss": 0.0002, "reward": 0.6679687947034836, "reward_std": 0.3104640601668507, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.1992187537252903, "step": 71 }, { "completion_length": 882.265625, "epoch": 0.14957153985977667, "grad_norm": 0.09208390861749649, "kl": 0.40190327540040016, "learning_rate": 9.883102778550434e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.3977475520223379, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 72 }, { "completion_length": 889.78125, "epoch": 0.15164892235782915, "grad_norm": 0.09202940762042999, "kl": 0.38338571041822433, "learning_rate": 9.879548001567931e-05, "loss": 0.0002, "reward": 0.7000000476837158, "reward_std": 0.4419417232275009, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 73 }, { "completion_length": 942.609375, "epoch": 0.1537263048558816, "grad_norm": 0.06312800943851471, "kl": 0.4080694951117039, "learning_rate": 9.875940640409234e-05, "loss": 0.0002, "reward": 0.5750000402331352, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.375, "rewards/format_reward_func": 0.20000000298023224, "step": 74 }, { "completion_length": 948.859375, "epoch": 0.15580368735393405, "grad_norm": 0.0712570995092392, "kl": 0.4405221752822399, "learning_rate": 9.872280733948867e-05, "loss": 0.0002, "reward": 0.8085937947034836, "reward_std": 0.2883669827133417, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19921875186264515, "step": 75 }, { "completion_length": 1053.21875, "epoch": 0.1578810698519865, "grad_norm": 0.05858299508690834, "kl": 0.4397047348320484, "learning_rate": 9.868568321627611e-05, "loss": 0.0002, "reward": 0.7000000383704901, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 76 }, { "completion_length": 1019.96875, "epoch": 0.15995845235003894, "grad_norm": 0.07670939713716507, "kl": 0.40835118666291237, "learning_rate": 9.86480344345207e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.33145629800856113, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 77 }, { "completion_length": 1075.15625, "epoch": 0.1620358348480914, "grad_norm": 0.06651510298252106, "kl": 0.42486657947301865, "learning_rate": 9.860986139994239e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.28726211935281754, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 78 }, { "completion_length": 1096.828125, "epoch": 0.16411321734614387, "grad_norm": 0.06264790147542953, "kl": 0.3813174143433571, "learning_rate": 9.857116452391079e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 79 }, { "completion_length": 1159.390625, "epoch": 0.16619059984419632, "grad_norm": 0.06721258908510208, "kl": 0.41810835897922516, "learning_rate": 9.85319442234406e-05, "loss": 0.0002, "reward": 0.7617187947034836, "reward_std": 0.3104640692472458, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.1992187537252903, "step": 80 }, { "completion_length": 1207.40625, "epoch": 0.16826798234224877, "grad_norm": 0.07961631566286087, "kl": 0.353565227240324, "learning_rate": 9.84922009211872e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.4419417269527912, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 81 }, { "completion_length": 1267.5625, "epoch": 0.17034536484030122, "grad_norm": 0.06159353628754616, "kl": 0.3608316369354725, "learning_rate": 9.845193504544209e-05, "loss": 0.0002, "reward": 0.6218750365078449, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.20000000298023224, "step": 82 }, { "completion_length": 1271.1875, "epoch": 0.17242274733835367, "grad_norm": 0.0616268515586853, "kl": 0.3721548244357109, "learning_rate": 9.841114703012817e-05, "loss": 0.0002, "reward": 0.7613281682133675, "reward_std": 0.26682231575250626, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19882812723517418, "step": 83 }, { "completion_length": 1197.84375, "epoch": 0.17450012983640611, "grad_norm": 0.06743966042995453, "kl": 0.46105678752064705, "learning_rate": 9.836983731479525e-05, "loss": 0.0002, "reward": 0.7625000476837158, "reward_std": 0.30935920774936676, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 84 }, { "completion_length": 1239.859375, "epoch": 0.1765775123344586, "grad_norm": 0.07362944632768631, "kl": 0.35114892572164536, "learning_rate": 9.832800634461518e-05, "loss": 0.0002, "reward": 0.6828125417232513, "reward_std": 0.3336660098284483, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19843750074505806, "step": 85 }, { "completion_length": 1253.21875, "epoch": 0.17865489483251104, "grad_norm": 0.060973405838012695, "kl": 0.3400215059518814, "learning_rate": 9.828565457037703e-05, "loss": 0.0002, "reward": 0.7613281719386578, "reward_std": 0.2668223213404417, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19882812723517418, "step": 86 }, { "completion_length": 1251.828125, "epoch": 0.1807322773305635, "grad_norm": 0.06071100011467934, "kl": 0.3388819098472595, "learning_rate": 9.824278244848235e-05, "loss": 0.0002, "reward": 0.6843750402331352, "reward_std": 0.28726212307810783, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 87 }, { "completion_length": 1177.9375, "epoch": 0.18280965982861594, "grad_norm": 0.07785635441541672, "kl": 0.39376673474907875, "learning_rate": 9.819939044094016e-05, "loss": 0.0002, "reward": 0.6687500476837158, "reward_std": 0.3977475520223379, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.20000000298023224, "step": 88 }, { "completion_length": 1312.515625, "epoch": 0.1848870423266684, "grad_norm": 0.06982032209634781, "kl": 0.3353493846952915, "learning_rate": 9.815547901536201e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 89 }, { "completion_length": 1360.75, "epoch": 0.18696442482472087, "grad_norm": 0.06107737869024277, "kl": 0.45528167858719826, "learning_rate": 9.811104864495691e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 90 }, { "completion_length": 1353.296875, "epoch": 0.18904180732277331, "grad_norm": 0.06465540081262589, "kl": 0.353522464632988, "learning_rate": 9.806609980852628e-05, "loss": 0.0002, "reward": 0.8046875409781933, "reward_std": 0.2938912510871887, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19531250186264515, "step": 91 }, { "completion_length": 1441.984375, "epoch": 0.19111918982082576, "grad_norm": 0.0610247403383255, "kl": 0.36326174437999725, "learning_rate": 9.802063299045873e-05, "loss": 0.0002, "reward": 0.7468750402331352, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 92 }, { "completion_length": 1441.265625, "epoch": 0.1931965723188782, "grad_norm": 0.05115514621138573, "kl": 0.411540150642395, "learning_rate": 9.797464868072488e-05, "loss": 0.0002, "reward": 0.6812500357627869, "reward_std": 0.2032931987196207, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19687500223517418, "step": 93 }, { "completion_length": 1397.03125, "epoch": 0.19527395481693066, "grad_norm": 0.053147751837968826, "kl": 0.4489905573427677, "learning_rate": 9.792814737487207e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 94 }, { "completion_length": 1407.515625, "epoch": 0.1973513373149831, "grad_norm": 0.0552426278591156, "kl": 0.3701773174107075, "learning_rate": 9.788112957401903e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 95 }, { "completion_length": 1505.546875, "epoch": 0.1994287198130356, "grad_norm": 0.05075477808713913, "kl": 0.39650479704141617, "learning_rate": 9.783359578485047e-05, "loss": 0.0002, "reward": 0.8855469226837158, "reward_std": 0.17953883111476898, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19804687798023224, "step": 96 }, { "completion_length": 1542.90625, "epoch": 0.20150610231108804, "grad_norm": 0.053789589554071426, "kl": 0.35163769498467445, "learning_rate": 9.778554651961159e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 97 }, { "completion_length": 1533.46875, "epoch": 0.20358348480914049, "grad_norm": 0.05969106778502464, "kl": 0.40055200457572937, "learning_rate": 9.773698229610263e-05, "loss": 0.0002, "reward": 0.8664062917232513, "reward_std": 0.29499610885977745, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.1945312526077032, "step": 98 }, { "completion_length": 1658.3125, "epoch": 0.20566086730719293, "grad_norm": 0.05904076248407364, "kl": 0.3737713471055031, "learning_rate": 9.768790363767322e-05, "loss": 0.0002, "reward": 0.7132812924683094, "reward_std": 0.2905766908079386, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19765625149011612, "step": 99 }, { "completion_length": 1522.9375, "epoch": 0.20773824980524538, "grad_norm": 0.04626452177762985, "kl": 0.3718419596552849, "learning_rate": 9.763831107321678e-05, "loss": 0.0002, "reward": 0.6843750439584255, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 100 }, { "completion_length": 1437.34375, "epoch": 0.20981563230329783, "grad_norm": 0.0583551786839962, "kl": 0.3823527656495571, "learning_rate": 9.75882051371648e-05, "loss": 0.0002, "reward": 0.7929687947034836, "reward_std": 0.26626989617943764, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19921875186264515, "step": 101 }, { "completion_length": 1665.859375, "epoch": 0.2118930148013503, "grad_norm": 0.071258544921875, "kl": 0.3524062894284725, "learning_rate": 9.753758636948111e-05, "loss": 0.0002, "reward": 0.7121094167232513, "reward_std": 0.3806223217397928, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19648437574505806, "step": 102 }, { "completion_length": 1570.484375, "epoch": 0.21397039729940276, "grad_norm": 0.06221286952495575, "kl": 0.4012618362903595, "learning_rate": 9.748645531565604e-05, "loss": 0.0002, "reward": 0.8691406697034836, "reward_std": 0.2911291141062975, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19726562686264515, "step": 103 }, { "completion_length": 1470.140625, "epoch": 0.2160477797974552, "grad_norm": 0.05706779286265373, "kl": 0.37375468015670776, "learning_rate": 9.743481252670049e-05, "loss": 0.0002, "reward": 0.7136719189584255, "reward_std": 0.24583008396439254, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19804687798023224, "step": 104 }, { "completion_length": 1487.796875, "epoch": 0.21812516229550766, "grad_norm": 0.04427757114171982, "kl": 0.40517764165997505, "learning_rate": 9.738265855914013e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 105 }, { "completion_length": 1542.703125, "epoch": 0.2202025447935601, "grad_norm": 0.060884129256010056, "kl": 0.41977495700120926, "learning_rate": 9.732999397500926e-05, "loss": 0.0002, "reward": 0.6503906659781933, "reward_std": 0.24693494127131999, "rewards/argmax_reward_func": 0.453125, "rewards/format_reward_func": 0.19726562686264515, "step": 106 }, { "completion_length": 1549.03125, "epoch": 0.22227992729161256, "grad_norm": 0.04595618322491646, "kl": 0.47253532335162163, "learning_rate": 9.727681934184481e-05, "loss": 0.0002, "reward": 0.9000000506639481, "reward_std": 0.1590990237891674, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.19687500223517418, "step": 107 }, { "completion_length": 1636.546875, "epoch": 0.22435730978966503, "grad_norm": 0.03207004442811012, "kl": 0.37253231182694435, "learning_rate": 9.722313523268028e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 108 }, { "completion_length": 1696.84375, "epoch": 0.22643469228771748, "grad_norm": 0.08920740336179733, "kl": 0.7168225161731243, "learning_rate": 9.716894222603942e-05, "loss": 0.0004, "reward": 0.8093750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 109 }, { "completion_length": 1443.65625, "epoch": 0.22851207478576993, "grad_norm": 0.06247260421514511, "kl": 0.3850158527493477, "learning_rate": 9.711424090593019e-05, "loss": 0.0002, "reward": 0.7617187947034836, "reward_std": 0.2662698905915022, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.1992187537252903, "step": 110 }, { "completion_length": 1509.8125, "epoch": 0.23058945728382238, "grad_norm": 0.06556280702352524, "kl": 0.3532305136322975, "learning_rate": 9.705903186183828e-05, "loss": 0.0002, "reward": 0.7281250506639481, "reward_std": 0.3137786276638508, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19687500223517418, "step": 111 }, { "completion_length": 1467.875, "epoch": 0.23266683978187483, "grad_norm": 0.06337332725524902, "kl": 0.3515300862491131, "learning_rate": 9.700331568872086e-05, "loss": 0.0002, "reward": 0.8054687976837158, "reward_std": 0.2905766889452934, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19609375298023224, "step": 112 }, { "completion_length": 1409.40625, "epoch": 0.23474422227992728, "grad_norm": 0.06349179893732071, "kl": 0.35185598209500313, "learning_rate": 9.694709298700016e-05, "loss": 0.0002, "reward": 0.7750000469386578, "reward_std": 0.24748736945912242, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19687500223517418, "step": 113 }, { "completion_length": 1412.53125, "epoch": 0.23682160477797976, "grad_norm": 0.064293272793293, "kl": 0.506983544677496, "learning_rate": 9.689036436255699e-05, "loss": 0.0003, "reward": 0.724609412252903, "reward_std": 0.22483785497024655, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19335937686264515, "step": 114 }, { "completion_length": 1372.859375, "epoch": 0.2388989872760322, "grad_norm": 0.06638536602258682, "kl": 0.35727328434586525, "learning_rate": 9.683313042672418e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.287262124940753, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 115 }, { "completion_length": 1373.09375, "epoch": 0.24097636977408465, "grad_norm": 0.06149492412805557, "kl": 0.3754408285021782, "learning_rate": 9.677539179628005e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 116 }, { "completion_length": 1313.734375, "epoch": 0.2430537522721371, "grad_norm": 0.062166426330804825, "kl": 0.40280015021562576, "learning_rate": 9.671714909344174e-05, "loss": 0.0002, "reward": 0.8531250506639481, "reward_std": 0.2695844564586878, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19687500223517418, "step": 117 }, { "completion_length": 1132.78125, "epoch": 0.24513113477018955, "grad_norm": 0.06206024810671806, "kl": 0.4296950623393059, "learning_rate": 9.665840294585845e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 118 }, { "completion_length": 1270.625, "epoch": 0.24720851726824203, "grad_norm": 0.05055106431245804, "kl": 0.3436691351234913, "learning_rate": 9.659915398660477e-05, "loss": 0.0002, "reward": 0.7742187902331352, "reward_std": 0.16020388156175613, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19609375298023224, "step": 119 }, { "completion_length": 1221.015625, "epoch": 0.24928589976629448, "grad_norm": 0.06833141297101974, "kl": 0.3341045156121254, "learning_rate": 9.65394028541738e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 120 }, { "completion_length": 1273.125, "epoch": 0.2513632822643469, "grad_norm": 0.06194274127483368, "kl": 0.3503304682672024, "learning_rate": 9.647915019247029e-05, "loss": 0.0002, "reward": 0.6687500439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.20000000298023224, "step": 121 }, { "completion_length": 1169.4375, "epoch": 0.2534406647623994, "grad_norm": 0.05682160705327988, "kl": 0.4410577192902565, "learning_rate": 9.641839665080363e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 122 }, { "completion_length": 1201.921875, "epoch": 0.25551804726045185, "grad_norm": 0.061100929975509644, "kl": 0.3569498844444752, "learning_rate": 9.635714288388102e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 123 }, { "completion_length": 1296.234375, "epoch": 0.2575954297585043, "grad_norm": 0.0515943244099617, "kl": 0.34240079671144485, "learning_rate": 9.629538955180021e-05, "loss": 0.0002, "reward": 0.6835937909781933, "reward_std": 0.15578446350991726, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.19921875186264515, "step": 124 }, { "completion_length": 1237.078125, "epoch": 0.25967281225655675, "grad_norm": 0.07088616490364075, "kl": 0.34578079730272293, "learning_rate": 9.623313732004258e-05, "loss": 0.0002, "reward": 0.6687500402331352, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.20000000298023224, "step": 125 }, { "completion_length": 1213.71875, "epoch": 0.2617501947546092, "grad_norm": 0.05374123901128769, "kl": 0.3377624601125717, "learning_rate": 9.617038685946578e-05, "loss": 0.0002, "reward": 0.7468750402331352, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 126 }, { "completion_length": 1325.421875, "epoch": 0.26382757725266165, "grad_norm": 0.05408313870429993, "kl": 0.3581954091787338, "learning_rate": 9.610713884629666e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 127 }, { "completion_length": 1305.421875, "epoch": 0.2659049597507141, "grad_norm": 0.05651029199361801, "kl": 0.3299425356090069, "learning_rate": 9.60433939621239e-05, "loss": 0.0002, "reward": 0.6500000394880772, "reward_std": 0.15909902285784483, "rewards/argmax_reward_func": 0.453125, "rewards/format_reward_func": 0.19687500223517418, "step": 128 }, { "completion_length": 1394.34375, "epoch": 0.26798234224876655, "grad_norm": 0.05847406014800072, "kl": 0.3248457871377468, "learning_rate": 9.597915289389066e-05, "loss": 0.0002, "reward": 0.8847656697034836, "reward_std": 0.22483785450458527, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19726562686264515, "step": 129 }, { "completion_length": 1361.125, "epoch": 0.270059724746819, "grad_norm": 0.03918185085058212, "kl": 0.29694442078471184, "learning_rate": 9.591441633388724e-05, "loss": 0.0001, "reward": 0.8687500506639481, "reward_std": 0.11490485025569797, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19687500223517418, "step": 130 }, { "completion_length": 1294.34375, "epoch": 0.27213710724487145, "grad_norm": 0.06627894192934036, "kl": 0.317622110247612, "learning_rate": 9.584918497974354e-05, "loss": 0.0002, "reward": 0.8031250387430191, "reward_std": 0.2519067842513323, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19375000149011612, "step": 131 }, { "completion_length": 1264.578125, "epoch": 0.2742144897429239, "grad_norm": 0.05716657266020775, "kl": 0.33274614438414574, "learning_rate": 9.578345953442162e-05, "loss": 0.0002, "reward": 0.7093750424683094, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19375000335276127, "step": 132 }, { "completion_length": 1101.5625, "epoch": 0.27629187224097634, "grad_norm": 0.06597350537776947, "kl": 0.3318898268043995, "learning_rate": 9.571724070620806e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 133 }, { "completion_length": 1340.328125, "epoch": 0.27836925473902885, "grad_norm": 0.06743122637271881, "kl": 0.2939135618507862, "learning_rate": 9.565052920870636e-05, "loss": 0.0001, "reward": 0.6312500461935997, "reward_std": 0.27400387404486537, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.19375000149011612, "step": 134 }, { "completion_length": 1385.546875, "epoch": 0.2804466372370813, "grad_norm": 0.05118987336754799, "kl": 0.27961407601833344, "learning_rate": 9.558332576082925e-05, "loss": 0.0001, "reward": 0.8664062991738319, "reward_std": 0.20660776272416115, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19453125074505806, "step": 135 }, { "completion_length": 1284.90625, "epoch": 0.28252401973513375, "grad_norm": 0.060963716357946396, "kl": 0.310220867395401, "learning_rate": 9.551563108679091e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 136 }, { "completion_length": 1260.078125, "epoch": 0.2846014022331862, "grad_norm": 0.0460037924349308, "kl": 0.39314381033182144, "learning_rate": 9.544744591609922e-05, "loss": 0.0002, "reward": 0.7781250402331352, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 137 }, { "completion_length": 1031.03125, "epoch": 0.28667878473123865, "grad_norm": 0.06592284142971039, "kl": 0.4330439232289791, "learning_rate": 9.537877098354786e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 138 }, { "completion_length": 1101.796875, "epoch": 0.2887561672292911, "grad_norm": 0.0644359141588211, "kl": 0.2887462917715311, "learning_rate": 9.53096070292084e-05, "loss": 0.0001, "reward": 0.8218750432133675, "reward_std": 0.22539028152823448, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19687500223517418, "step": 139 }, { "completion_length": 1065.90625, "epoch": 0.29083354972734354, "grad_norm": 0.065298892557621, "kl": 0.30470659770071507, "learning_rate": 9.523995479842232e-05, "loss": 0.0002, "reward": 0.6218750365078449, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.20000000298023224, "step": 140 }, { "completion_length": 978.953125, "epoch": 0.292910932225396, "grad_norm": 0.05792571231722832, "kl": 0.4863986298441887, "learning_rate": 9.516981504179299e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 141 }, { "completion_length": 1087.9375, "epoch": 0.29498831472344844, "grad_norm": 0.06688184291124344, "kl": 0.29886077158153057, "learning_rate": 9.509918851517758e-05, "loss": 0.0001, "reward": 0.8562500476837158, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 142 }, { "completion_length": 1019.140625, "epoch": 0.2970656972215009, "grad_norm": 0.06881757080554962, "kl": 0.3445068225264549, "learning_rate": 9.502807597967893e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 143 }, { "completion_length": 1186.5625, "epoch": 0.29914307971955334, "grad_norm": 0.05837235972285271, "kl": 0.32853276655077934, "learning_rate": 9.495647820163725e-05, "loss": 0.0002, "reward": 0.8855469226837158, "reward_std": 0.17953882738947868, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19804687798023224, "step": 144 }, { "completion_length": 931.265625, "epoch": 0.30122046221760584, "grad_norm": 0.046699460595846176, "kl": 0.33741075173020363, "learning_rate": 9.488439595262204e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 145 }, { "completion_length": 944.25, "epoch": 0.3032978447156583, "grad_norm": 0.06217503920197487, "kl": 0.3317374251782894, "learning_rate": 9.48118300094236e-05, "loss": 0.0002, "reward": 0.7312500476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 146 }, { "completion_length": 1001.078125, "epoch": 0.30537522721371074, "grad_norm": 0.06545262783765793, "kl": 0.3109145648777485, "learning_rate": 9.473878115404477e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 147 }, { "completion_length": 910.796875, "epoch": 0.3074526097117632, "grad_norm": 0.05757139250636101, "kl": 0.3004848547279835, "learning_rate": 9.466525017369243e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 148 }, { "completion_length": 1078.265625, "epoch": 0.30952999220981564, "grad_norm": 0.07616781443357468, "kl": 0.28462448343634605, "learning_rate": 9.459123786076912e-05, "loss": 0.0001, "reward": 0.8093750476837158, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 149 }, { "completion_length": 963.71875, "epoch": 0.3116073747078681, "grad_norm": 0.06607849150896072, "kl": 0.2945715934038162, "learning_rate": 9.451674501286436e-05, "loss": 0.0001, "reward": 0.7468750402331352, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 150 }, { "completion_length": 925.484375, "epoch": 0.31368475720592054, "grad_norm": 0.08415860682725906, "kl": 0.326167568564415, "learning_rate": 9.444177243274618e-05, "loss": 0.0002, "reward": 0.7000000439584255, "reward_std": 0.35355337895452976, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 151 }, { "completion_length": 814.296875, "epoch": 0.315762139703973, "grad_norm": 0.049171049147844315, "kl": 0.312137458473444, "learning_rate": 9.436632092835239e-05, "loss": 0.0002, "reward": 1.0281250476837158, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.828125, "rewards/format_reward_func": 0.20000000298023224, "step": 152 }, { "completion_length": 783.484375, "epoch": 0.31783952220202544, "grad_norm": 0.06367822736501694, "kl": 0.33039499446749687, "learning_rate": 9.42903913127819e-05, "loss": 0.0002, "reward": 0.7281250394880772, "reward_std": 0.18119611032307148, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19687500223517418, "step": 153 }, { "completion_length": 861.609375, "epoch": 0.3199169047000779, "grad_norm": 0.06421905755996704, "kl": 0.3065376691520214, "learning_rate": 9.421398440428597e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 154 }, { "completion_length": 902.390625, "epoch": 0.32199428719813034, "grad_norm": 0.07363509386777878, "kl": 0.33688198402523994, "learning_rate": 9.413710102625938e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.287262124940753, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 155 }, { "completion_length": 922.0625, "epoch": 0.3240716696961828, "grad_norm": 0.06810685992240906, "kl": 0.34481339529156685, "learning_rate": 9.405974200723155e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 156 }, { "completion_length": 884.640625, "epoch": 0.3261490521942353, "grad_norm": 0.06919455528259277, "kl": 0.3362896367907524, "learning_rate": 9.398190818085763e-05, "loss": 0.0002, "reward": 0.8398437947034836, "reward_std": 0.2441728077828884, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19921875186264515, "step": 157 }, { "completion_length": 831.109375, "epoch": 0.32822643469228774, "grad_norm": 0.08263985067605972, "kl": 0.882828488945961, "learning_rate": 9.390360038590951e-05, "loss": 0.0004, "reward": 0.8531250506639481, "reward_std": 0.22539028339087963, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19687500223517418, "step": 158 }, { "completion_length": 879.40625, "epoch": 0.3303038171903402, "grad_norm": 0.0637197494506836, "kl": 0.31912703067064285, "learning_rate": 9.382481946626674e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 159 }, { "completion_length": 696.3125, "epoch": 0.33238119968839264, "grad_norm": 0.08041277527809143, "kl": 0.3748646304011345, "learning_rate": 9.374556627090749e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 160 }, { "completion_length": 891.265625, "epoch": 0.3344585821864451, "grad_norm": 0.06974095106124878, "kl": 0.3626530338078737, "learning_rate": 9.366584165389941e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 161 }, { "completion_length": 813.03125, "epoch": 0.33653596468449753, "grad_norm": 0.092588409781456, "kl": 0.3759094402194023, "learning_rate": 9.358564647439037e-05, "loss": 0.0002, "reward": 0.7593750506639481, "reward_std": 0.35797279700636864, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19687500223517418, "step": 162 }, { "completion_length": 838.296875, "epoch": 0.33861334718255, "grad_norm": 0.06730964034795761, "kl": 0.3410007916390896, "learning_rate": 9.350498159659924e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 163 }, { "completion_length": 775.375, "epoch": 0.34069072968060243, "grad_norm": 0.06589485704898834, "kl": 0.3439077027142048, "learning_rate": 9.342384788980656e-05, "loss": 0.0002, "reward": 0.7312500420957804, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 164 }, { "completion_length": 781.0, "epoch": 0.3427681121786549, "grad_norm": 0.07955412566661835, "kl": 0.359022606164217, "learning_rate": 9.33422462283452e-05, "loss": 0.0002, "reward": 0.7468750402331352, "reward_std": 0.287262124940753, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 165 }, { "completion_length": 895.96875, "epoch": 0.34484549467670733, "grad_norm": 0.06293340772390366, "kl": 0.4317344203591347, "learning_rate": 9.326017749159087e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 166 }, { "completion_length": 884.25, "epoch": 0.3469228771747598, "grad_norm": 0.07700355350971222, "kl": 0.5744567923247814, "learning_rate": 9.317764256395275e-05, "loss": 0.0003, "reward": 0.6031250357627869, "reward_std": 0.26958445459604263, "rewards/argmax_reward_func": 0.40625, "rewards/format_reward_func": 0.19687500409781933, "step": 167 }, { "completion_length": 889.921875, "epoch": 0.34900025967281223, "grad_norm": 0.062210842967033386, "kl": 0.32679086178541183, "learning_rate": 9.309464233486387e-05, "loss": 0.0002, "reward": 0.7468750402331352, "reward_std": 0.19887377507984638, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 168 }, { "completion_length": 788.84375, "epoch": 0.35107764217086473, "grad_norm": 0.0710466280579567, "kl": 0.35585347935557365, "learning_rate": 9.301117769877153e-05, "loss": 0.0002, "reward": 0.8187500350177288, "reward_std": 0.22207572124898434, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19375000149011612, "step": 169 }, { "completion_length": 849.484375, "epoch": 0.3531550246689172, "grad_norm": 0.0648435726761818, "kl": 0.32205165177583694, "learning_rate": 9.292724955512774e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 170 }, { "completion_length": 856.984375, "epoch": 0.35523240716696963, "grad_norm": 0.06077580899000168, "kl": 0.34612051025032997, "learning_rate": 9.284285880837946e-05, "loss": 0.0002, "reward": 0.7156250365078449, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 171 }, { "completion_length": 776.5, "epoch": 0.3573097896650221, "grad_norm": 0.07481009513139725, "kl": 0.3612271770834923, "learning_rate": 9.275800636795884e-05, "loss": 0.0002, "reward": 0.7773437909781933, "reward_std": 0.28836698085069656, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.1992187537252903, "step": 172 }, { "completion_length": 789.046875, "epoch": 0.35938717216307453, "grad_norm": 0.07435107976198196, "kl": 0.3340052030980587, "learning_rate": 9.267269314827345e-05, "loss": 0.0002, "reward": 0.8398437947034836, "reward_std": 0.28836698085069656, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19921875186264515, "step": 173 }, { "completion_length": 768.390625, "epoch": 0.361464554661127, "grad_norm": 0.08201409131288528, "kl": 0.3197612836956978, "learning_rate": 9.258692006869643e-05, "loss": 0.0002, "reward": 0.621093787252903, "reward_std": 0.3325611485633999, "rewards/argmax_reward_func": 0.421875, "rewards/format_reward_func": 0.1992187537252903, "step": 174 }, { "completion_length": 740.515625, "epoch": 0.36354193715917943, "grad_norm": 0.08215157687664032, "kl": 0.3205004744231701, "learning_rate": 9.250068805355658e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 175 }, { "completion_length": 833.796875, "epoch": 0.3656193196572319, "grad_norm": 0.06702969968318939, "kl": 0.31005076318979263, "learning_rate": 9.24139980321284e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.28726212307810783, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 176 }, { "completion_length": 790.546875, "epoch": 0.3676967021552843, "grad_norm": 0.08229520171880722, "kl": 0.32232359051704407, "learning_rate": 9.232685093862204e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 177 }, { "completion_length": 814.953125, "epoch": 0.3697740846533368, "grad_norm": 0.0782497227191925, "kl": 0.3153250627219677, "learning_rate": 9.22392477121733e-05, "loss": 0.0002, "reward": 0.7750000506639481, "reward_std": 0.3358757123351097, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19687500223517418, "step": 178 }, { "completion_length": 972.390625, "epoch": 0.3718514671513892, "grad_norm": 0.07078168541193008, "kl": 0.3324251137673855, "learning_rate": 9.215118929683344e-05, "loss": 0.0002, "reward": 0.7750000469386578, "reward_std": 0.29168154671788216, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19687500223517418, "step": 179 }, { "completion_length": 757.90625, "epoch": 0.37392884964944173, "grad_norm": 0.09468799084424973, "kl": 0.33874499425292015, "learning_rate": 9.206267664155907e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.4640388172119856, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 180 }, { "completion_length": 844.265625, "epoch": 0.3760062321474942, "grad_norm": 0.08337994664907455, "kl": 0.31716278567910194, "learning_rate": 9.197371070020184e-05, "loss": 0.0002, "reward": 0.7906250506639481, "reward_std": 0.3579728025943041, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19687500223517418, "step": 181 }, { "completion_length": 864.234375, "epoch": 0.37808361464554663, "grad_norm": 0.0694384053349495, "kl": 0.32097451388835907, "learning_rate": 9.188429243149824e-05, "loss": 0.0002, "reward": 0.6472656652331352, "reward_std": 0.24362037517130375, "rewards/argmax_reward_func": 0.453125, "rewards/format_reward_func": 0.19414062798023224, "step": 182 }, { "completion_length": 749.046875, "epoch": 0.3801609971435991, "grad_norm": 0.062498513609170914, "kl": 0.3336629420518875, "learning_rate": 9.179442279905928e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 183 }, { "completion_length": 923.640625, "epoch": 0.3822383796416515, "grad_norm": 0.07083828747272491, "kl": 0.32143479958176613, "learning_rate": 9.170410277135999e-05, "loss": 0.0002, "reward": 0.75625004991889, "reward_std": 0.31819804944097996, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19375000149011612, "step": 184 }, { "completion_length": 884.015625, "epoch": 0.384315762139704, "grad_norm": 0.0693785548210144, "kl": 0.4987417571246624, "learning_rate": 9.161333332172912e-05, "loss": 0.0002, "reward": 0.7062500454485416, "reward_std": 0.24748736806213856, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.1906250026077032, "step": 185 }, { "completion_length": 795.234375, "epoch": 0.3863931446377564, "grad_norm": 0.07442086935043335, "kl": 0.33293722197413445, "learning_rate": 9.152211542833857e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 186 }, { "completion_length": 929.25, "epoch": 0.3884705271358089, "grad_norm": 0.05656367912888527, "kl": 0.31210994347929955, "learning_rate": 9.143045007419284e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 187 }, { "completion_length": 902.828125, "epoch": 0.3905479096338613, "grad_norm": 0.06982313841581345, "kl": 0.3056885749101639, "learning_rate": 9.133833824711853e-05, "loss": 0.0002, "reward": 0.7156250439584255, "reward_std": 0.28726212307810783, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 188 }, { "completion_length": 995.71875, "epoch": 0.3926252921319138, "grad_norm": 0.07361900061368942, "kl": 0.3039589188992977, "learning_rate": 9.124578093975358e-05, "loss": 0.0002, "reward": 0.7300781719386578, "reward_std": 0.35521066188812256, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19882812909781933, "step": 189 }, { "completion_length": 964.359375, "epoch": 0.3947026746299662, "grad_norm": 0.07159875333309174, "kl": 0.3399963229894638, "learning_rate": 9.115277914953662e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 190 }, { "completion_length": 1034.03125, "epoch": 0.39678005712801867, "grad_norm": 0.07782501727342606, "kl": 0.3184865601360798, "learning_rate": 9.105933387869628e-05, "loss": 0.0002, "reward": 0.6910156607627869, "reward_std": 0.4104533866047859, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.19101562723517418, "step": 191 }, { "completion_length": 1139.8125, "epoch": 0.3988574396260712, "grad_norm": 0.05474551394581795, "kl": 0.29825419560074806, "learning_rate": 9.096544613424025e-05, "loss": 0.0001, "reward": 0.8804688006639481, "reward_std": 0.27510872669517994, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19296875223517418, "step": 192 }, { "completion_length": 1003.0, "epoch": 0.4009348221241236, "grad_norm": 0.0725637748837471, "kl": 0.3301442116498947, "learning_rate": 9.087111692794459e-05, "loss": 0.0002, "reward": 0.7304687947034836, "reward_std": 0.31046406365931034, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.1992187537252903, "step": 193 }, { "completion_length": 1139.734375, "epoch": 0.4030122046221761, "grad_norm": 0.057006120681762695, "kl": 0.31723184883594513, "learning_rate": 9.077634727634272e-05, "loss": 0.0002, "reward": 0.8449219167232513, "reward_std": 0.23146697832271457, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.1886718776077032, "step": 194 }, { "completion_length": 1048.71875, "epoch": 0.4050895871202285, "grad_norm": 0.07205278426408768, "kl": 0.33233997970819473, "learning_rate": 9.068113820071447e-05, "loss": 0.0002, "reward": 0.7875000387430191, "reward_std": 0.3181980513036251, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19375000335276127, "step": 195 }, { "completion_length": 955.6875, "epoch": 0.40716696961828097, "grad_norm": 0.057774197310209274, "kl": 0.3174768090248108, "learning_rate": 9.058549072707513e-05, "loss": 0.0002, "reward": 0.8347656726837158, "reward_std": 0.1994262058287859, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19414062798023224, "step": 196 }, { "completion_length": 1300.125, "epoch": 0.4092443521163334, "grad_norm": 0.05007508769631386, "kl": 0.30298993550240993, "learning_rate": 9.048940588616435e-05, "loss": 0.0002, "reward": 0.7843750491738319, "reward_std": 0.22539028525352478, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.1906250026077032, "step": 197 }, { "completion_length": 1237.953125, "epoch": 0.41132173461438587, "grad_norm": 0.060646846890449524, "kl": 0.3001830168068409, "learning_rate": 9.039288471343504e-05, "loss": 0.0002, "reward": 0.8812500461935997, "reward_std": 0.27400387451052666, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19375000335276127, "step": 198 }, { "completion_length": 1186.453125, "epoch": 0.4133991171124383, "grad_norm": 0.05342816561460495, "kl": 0.30144498124718666, "learning_rate": 9.029592824904225e-05, "loss": 0.0002, "reward": 0.8074219226837158, "reward_std": 0.24583008885383606, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19804687798023224, "step": 199 }, { "completion_length": 1274.375, "epoch": 0.41547649961049077, "grad_norm": 0.05866052210330963, "kl": 0.3122952822595835, "learning_rate": 9.019853753783185e-05, "loss": 0.0002, "reward": 0.652343787252903, "reward_std": 0.2264951393008232, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.1835937537252903, "step": 200 }, { "completion_length": 1241.5, "epoch": 0.4175538821085432, "grad_norm": 0.05399727076292038, "kl": 0.33987458795309067, "learning_rate": 9.010071362932944e-05, "loss": 0.0002, "reward": 0.8687500432133675, "reward_std": 0.2032931987196207, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19687500223517418, "step": 201 }, { "completion_length": 1336.140625, "epoch": 0.41963126460659567, "grad_norm": 0.06403433531522751, "kl": 0.28705168329179287, "learning_rate": 9.000245757772885e-05, "loss": 0.0001, "reward": 0.8281250521540642, "reward_std": 0.29610096476972103, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.18750000558793545, "step": 202 }, { "completion_length": 1035.203125, "epoch": 0.42170864710464817, "grad_norm": 0.0628470629453659, "kl": 0.30243775993585587, "learning_rate": 8.990377044188098e-05, "loss": 0.0002, "reward": 0.85000004991889, "reward_std": 0.22980970283970237, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19375000149011612, "step": 203 }, { "completion_length": 1175.90625, "epoch": 0.4237860296027006, "grad_norm": 0.05064735934138298, "kl": 0.3158372975885868, "learning_rate": 8.980465328528219e-05, "loss": 0.0002, "reward": 0.7906250394880772, "reward_std": 0.18119611404836178, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19687500223517418, "step": 204 }, { "completion_length": 1109.28125, "epoch": 0.42586341210075307, "grad_norm": 0.060826126486063004, "kl": 0.2915416620671749, "learning_rate": 8.9705107176063e-05, "loss": 0.0001, "reward": 0.9437500461935997, "reward_std": 0.22980970703065395, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.19375000149011612, "step": 205 }, { "completion_length": 979.453125, "epoch": 0.4279407945988055, "grad_norm": 0.062372464686632156, "kl": 0.3590022251009941, "learning_rate": 8.960513318697647e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 206 }, { "completion_length": 1014.40625, "epoch": 0.43001817709685797, "grad_norm": 0.0675223246216774, "kl": 0.3203696608543396, "learning_rate": 8.950473239538673e-05, "loss": 0.0002, "reward": 0.8835937976837158, "reward_std": 0.27068931609392166, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.1960937511175871, "step": 207 }, { "completion_length": 1077.65625, "epoch": 0.4320955595949104, "grad_norm": 0.07710019499063492, "kl": 0.28732946887612343, "learning_rate": 8.940390588325727e-05, "loss": 0.0001, "reward": 0.8781250491738319, "reward_std": 0.4110058154910803, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19062500447034836, "step": 208 }, { "completion_length": 985.4375, "epoch": 0.43417294209296287, "grad_norm": 0.04350803792476654, "kl": 0.3246513232588768, "learning_rate": 8.930265473713938e-05, "loss": 0.0002, "reward": 0.8531250469386578, "reward_std": 0.13700193725526333, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19687500409781933, "step": 209 }, { "completion_length": 984.140625, "epoch": 0.4362503245910153, "grad_norm": 0.06984654814004898, "kl": 0.32982902973890305, "learning_rate": 8.920098004816036e-05, "loss": 0.0002, "reward": 0.8710937947034836, "reward_std": 0.24417280592024326, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19921875186264515, "step": 210 }, { "completion_length": 870.03125, "epoch": 0.43832770708906776, "grad_norm": 0.06809406727552414, "kl": 0.29028210788965225, "learning_rate": 8.909888291201182e-05, "loss": 0.0001, "reward": 0.8718750439584255, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 211 }, { "completion_length": 906.40625, "epoch": 0.4404050895871202, "grad_norm": 0.08602919429540634, "kl": 0.2871505431830883, "learning_rate": 8.899636442893783e-05, "loss": 0.0001, "reward": 0.8062500469386578, "reward_std": 0.3800698835402727, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19687500223517418, "step": 212 }, { "completion_length": 869.421875, "epoch": 0.44248247208517266, "grad_norm": 0.05844723433256149, "kl": 0.26848769187927246, "learning_rate": 8.88934257037231e-05, "loss": 0.0001, "reward": 0.8710937947034836, "reward_std": 0.19997863844037056, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.1992187537252903, "step": 213 }, { "completion_length": 864.75, "epoch": 0.4445598545832251, "grad_norm": 0.07575644552707672, "kl": 0.4048551693558693, "learning_rate": 8.879006784568104e-05, "loss": 0.0002, "reward": 0.7613281682133675, "reward_std": 0.26682231947779655, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19882812723517418, "step": 214 }, { "completion_length": 877.96875, "epoch": 0.4466372370812776, "grad_norm": 0.07090688496828079, "kl": 0.2992668803781271, "learning_rate": 8.868629196864182e-05, "loss": 0.0001, "reward": 0.8035156689584255, "reward_std": 0.24362037889659405, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19414062798023224, "step": 215 }, { "completion_length": 779.8125, "epoch": 0.44871461957933007, "grad_norm": 0.069987952709198, "kl": 0.2883603498339653, "learning_rate": 8.858209919094039e-05, "loss": 0.0001, "reward": 0.7156250402331352, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 216 }, { "completion_length": 748.59375, "epoch": 0.4507920020773825, "grad_norm": 0.07478881627321243, "kl": 0.2929369006305933, "learning_rate": 8.847749063540439e-05, "loss": 0.0001, "reward": 0.8066406697034836, "reward_std": 0.24693494103848934, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19726562686264515, "step": 217 }, { "completion_length": 749.8125, "epoch": 0.45286938457543496, "grad_norm": 0.08688110113143921, "kl": 0.3713537007570267, "learning_rate": 8.837246742934207e-05, "loss": 0.0002, "reward": 0.7765625454485416, "reward_std": 0.33366601169109344, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.1984375026077032, "step": 218 }, { "completion_length": 697.546875, "epoch": 0.4549467670734874, "grad_norm": 0.08973264694213867, "kl": 0.36194442212581635, "learning_rate": 8.826703070453015e-05, "loss": 0.0002, "reward": 0.8511719219386578, "reward_std": 0.316540764644742, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19492187909781933, "step": 219 }, { "completion_length": 736.953125, "epoch": 0.45702414957153986, "grad_norm": 0.06507878005504608, "kl": 0.32681479677557945, "learning_rate": 8.816118159720156e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 220 }, { "completion_length": 686.046875, "epoch": 0.4591015320695923, "grad_norm": 0.08443711698055267, "kl": 0.27842542715370655, "learning_rate": 8.805492124803331e-05, "loss": 0.0001, "reward": 0.7750000506639481, "reward_std": 0.2474873699247837, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19687500223517418, "step": 221 }, { "completion_length": 636.28125, "epoch": 0.46117891456764476, "grad_norm": 0.08033400774002075, "kl": 0.2751711644232273, "learning_rate": 8.794825080213414e-05, "loss": 0.0001, "reward": 0.7781250476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 222 }, { "completion_length": 652.40625, "epoch": 0.4632562970656972, "grad_norm": 0.07973612844944, "kl": 0.28703486546874046, "learning_rate": 8.78411714090321e-05, "loss": 0.0001, "reward": 0.7937500439584255, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 223 }, { "completion_length": 656.5, "epoch": 0.46533367956374966, "grad_norm": 0.0935521349310875, "kl": 0.28887104988098145, "learning_rate": 8.77336842226623e-05, "loss": 0.0001, "reward": 0.7937500439584255, "reward_std": 0.3535533845424652, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 224 }, { "completion_length": 601.90625, "epoch": 0.4674110620618021, "grad_norm": 0.08775703608989716, "kl": 0.2865128982812166, "learning_rate": 8.76257904013544e-05, "loss": 0.0001, "reward": 0.7468750439584255, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 225 }, { "completion_length": 649.09375, "epoch": 0.46948844455985456, "grad_norm": 0.07471180707216263, "kl": 0.3112582378089428, "learning_rate": 8.751749110782012e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 226 }, { "completion_length": 651.375, "epoch": 0.47156582705790706, "grad_norm": 0.08434654772281647, "kl": 0.33592014387249947, "learning_rate": 8.740878750914076e-05, "loss": 0.0002, "reward": 0.8390625491738319, "reward_std": 0.24527766555547714, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.1984375026077032, "step": 227 }, { "completion_length": 592.15625, "epoch": 0.4736432095559595, "grad_norm": 0.10138159990310669, "kl": 0.3475854229182005, "learning_rate": 8.729968077675454e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 228 }, { "completion_length": 618.953125, "epoch": 0.47572059205401196, "grad_norm": 0.08923006802797318, "kl": 0.32317574694752693, "learning_rate": 8.71901720864441e-05, "loss": 0.0002, "reward": 0.8085937947034836, "reward_std": 0.28836698085069656, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.1992187537252903, "step": 229 }, { "completion_length": 606.46875, "epoch": 0.4777979745520644, "grad_norm": 0.07547228038311005, "kl": 0.4202072508633137, "learning_rate": 8.70802626183237e-05, "loss": 0.0002, "reward": 0.7757812924683094, "reward_std": 0.20218834839761257, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19765625149011612, "step": 230 }, { "completion_length": 567.375, "epoch": 0.47987535705011686, "grad_norm": 0.07534275949001312, "kl": 0.5509752966463566, "learning_rate": 8.696995355682656e-05, "loss": 0.0003, "reward": 0.8250000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 231 }, { "completion_length": 615.265625, "epoch": 0.4819527395481693, "grad_norm": 0.08038201183080673, "kl": 0.3771616071462631, "learning_rate": 8.685924609069214e-05, "loss": 0.0002, "reward": 0.8695312887430191, "reward_std": 0.20218834280967712, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19765625149011612, "step": 232 }, { "completion_length": 602.453125, "epoch": 0.48403012204622176, "grad_norm": 0.07698789983987808, "kl": 0.6121297106146812, "learning_rate": 8.674814141295324e-05, "loss": 0.0003, "reward": 0.8718750439584255, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 233 }, { "completion_length": 592.75, "epoch": 0.4861075045442742, "grad_norm": 0.09831973165273666, "kl": 0.31690799072384834, "learning_rate": 8.663664072092323e-05, "loss": 0.0002, "reward": 0.8246094211935997, "reward_std": 0.3099116366356611, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19960937835276127, "step": 234 }, { "completion_length": 558.90625, "epoch": 0.48818488704232665, "grad_norm": 0.09684620797634125, "kl": 0.3237866424024105, "learning_rate": 8.652474521618306e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.3093592096120119, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 235 }, { "completion_length": 626.078125, "epoch": 0.4902622695403791, "grad_norm": 0.06933271139860153, "kl": 0.3663709722459316, "learning_rate": 8.641245610456838e-05, "loss": 0.0002, "reward": 0.9812500476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.78125, "rewards/format_reward_func": 0.20000000298023224, "step": 236 }, { "completion_length": 579.40625, "epoch": 0.49233965203843155, "grad_norm": 0.08088324964046478, "kl": 0.3435916490852833, "learning_rate": 8.629977459615655e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 237 }, { "completion_length": 607.125, "epoch": 0.49441703453648406, "grad_norm": 0.07071245461702347, "kl": 0.2806865181773901, "learning_rate": 8.618670190525352e-05, "loss": 0.0001, "reward": 0.8250000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 238 }, { "completion_length": 558.390625, "epoch": 0.4964944170345365, "grad_norm": 0.08282584697008133, "kl": 0.40584639832377434, "learning_rate": 8.607323925038082e-05, "loss": 0.0002, "reward": 0.7156250439584255, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 239 }, { "completion_length": 667.71875, "epoch": 0.49857179953258896, "grad_norm": 0.08190900087356567, "kl": 0.35680179484188557, "learning_rate": 8.595938785426241e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 240 }, { "completion_length": 670.8125, "epoch": 0.5006491820306413, "grad_norm": 0.08591850101947784, "kl": 0.3619570918381214, "learning_rate": 8.584514894381151e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 241 }, { "completion_length": 676.25, "epoch": 0.5027265645286938, "grad_norm": 0.08506251126527786, "kl": 0.3224334083497524, "learning_rate": 8.573052375011733e-05, "loss": 0.0002, "reward": 0.8867187947034836, "reward_std": 0.2662698905915022, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.1992187537252903, "step": 242 }, { "completion_length": 673.1875, "epoch": 0.5048039470267462, "grad_norm": 0.06150234118103981, "kl": 0.3478453829884529, "learning_rate": 8.561551350843186e-05, "loss": 0.0002, "reward": 0.9656250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.765625, "rewards/format_reward_func": 0.20000000298023224, "step": 243 }, { "completion_length": 692.546875, "epoch": 0.5068813295247988, "grad_norm": 0.06618204712867737, "kl": 0.29330621659755707, "learning_rate": 8.550011945815655e-05, "loss": 0.0001, "reward": 0.8562500476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 244 }, { "completion_length": 687.03125, "epoch": 0.5089587120228513, "grad_norm": 0.07623764872550964, "kl": 0.3498356007039547, "learning_rate": 8.538434284282892e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 245 }, { "completion_length": 660.703125, "epoch": 0.5110360945209037, "grad_norm": 0.04135030135512352, "kl": 0.32844917103648186, "learning_rate": 8.526818491010922e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 246 }, { "completion_length": 716.609375, "epoch": 0.5131134770189562, "grad_norm": 0.0865129679441452, "kl": 0.3059841375797987, "learning_rate": 8.515164691176687e-05, "loss": 0.0002, "reward": 0.7312500439584255, "reward_std": 0.3093592096120119, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 247 }, { "completion_length": 757.4375, "epoch": 0.5151908595170086, "grad_norm": 0.07157998532056808, "kl": 0.2929275669157505, "learning_rate": 8.503473010366713e-05, "loss": 0.0001, "reward": 0.8867187909781933, "reward_std": 0.22207572311162949, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19921875186264515, "step": 248 }, { "completion_length": 662.859375, "epoch": 0.517268242015061, "grad_norm": 0.06820650398731232, "kl": 0.31902188807725906, "learning_rate": 8.491743574575743e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 249 }, { "completion_length": 721.625, "epoch": 0.5193456245131135, "grad_norm": 0.0756940096616745, "kl": 0.31403973512351513, "learning_rate": 8.479976510205387e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.22097086161375046, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 250 }, { "completion_length": 748.046875, "epoch": 0.521423007011166, "grad_norm": 0.07090619206428528, "kl": 0.2569838650524616, "learning_rate": 8.468171944062755e-05, "loss": 0.0001, "reward": 0.7929687947034836, "reward_std": 0.22207572311162949, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19921875186264515, "step": 251 }, { "completion_length": 693.609375, "epoch": 0.5235003895092184, "grad_norm": 0.06538081914186478, "kl": 0.2928556613624096, "learning_rate": 8.456330003359093e-05, "loss": 0.0001, "reward": 0.8250000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 252 }, { "completion_length": 728.953125, "epoch": 0.5255777720072708, "grad_norm": 0.09474781900644302, "kl": 0.27900537475943565, "learning_rate": 8.444450815708415e-05, "loss": 0.0001, "reward": 0.8250000476837158, "reward_std": 0.3977475557476282, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 253 }, { "completion_length": 737.625, "epoch": 0.5276551545053233, "grad_norm": 0.06914320588111877, "kl": 0.26191011257469654, "learning_rate": 8.432534509126122e-05, "loss": 0.0001, "reward": 0.7468750439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 254 }, { "completion_length": 743.609375, "epoch": 0.5297325370033757, "grad_norm": 0.05855982005596161, "kl": 0.26190576888620853, "learning_rate": 8.420581212027624e-05, "loss": 0.0001, "reward": 0.8875000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 255 }, { "completion_length": 768.265625, "epoch": 0.5318099195014282, "grad_norm": 0.058118585497140884, "kl": 0.2930552177131176, "learning_rate": 8.408591053226964e-05, "loss": 0.0001, "reward": 0.9492187947034836, "reward_std": 0.13368737325072289, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.1992187537252903, "step": 256 }, { "completion_length": 763.0, "epoch": 0.5338873019994806, "grad_norm": 0.07115372270345688, "kl": 0.35762836039066315, "learning_rate": 8.396564161935411e-05, "loss": 0.0002, "reward": 0.8710937947034836, "reward_std": 0.1999786328524351, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.1992187537252903, "step": 257 }, { "completion_length": 865.8125, "epoch": 0.5359646844975331, "grad_norm": 0.06384899467229843, "kl": 0.2847513500601053, "learning_rate": 8.38450066776009e-05, "loss": 0.0001, "reward": 0.8375000506639481, "reward_std": 0.2032931987196207, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19687500223517418, "step": 258 }, { "completion_length": 691.734375, "epoch": 0.5380420669955855, "grad_norm": 0.08122014999389648, "kl": 0.2869179602712393, "learning_rate": 8.37240070070257e-05, "loss": 0.0001, "reward": 0.7937500439584255, "reward_std": 0.30935921147465706, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 259 }, { "completion_length": 759.203125, "epoch": 0.540119449493638, "grad_norm": 0.06432370841503143, "kl": 0.3190025221556425, "learning_rate": 8.360264391157471e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 260 }, { "completion_length": 872.890625, "epoch": 0.5421968319916904, "grad_norm": 0.08087541162967682, "kl": 0.2903926521539688, "learning_rate": 8.348091869911054e-05, "loss": 0.0001, "reward": 0.7554687969386578, "reward_std": 0.27510873042047024, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19296875409781933, "step": 261 }, { "completion_length": 868.875, "epoch": 0.5442742144897429, "grad_norm": 0.06983164697885513, "kl": 0.25976957008242607, "learning_rate": 8.335883268139813e-05, "loss": 0.0001, "reward": 0.8062500506639481, "reward_std": 0.247487373650074, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19687500223517418, "step": 262 }, { "completion_length": 781.53125, "epoch": 0.5463515969877953, "grad_norm": 0.07666690647602081, "kl": 0.286643173545599, "learning_rate": 8.323638717409061e-05, "loss": 0.0001, "reward": 0.8406250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 263 }, { "completion_length": 768.796875, "epoch": 0.5484289794858478, "grad_norm": 0.06480922549962997, "kl": 0.30170151591300964, "learning_rate": 8.311358349671517e-05, "loss": 0.0002, "reward": 0.7625000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 264 }, { "completion_length": 874.328125, "epoch": 0.5505063619839002, "grad_norm": 0.06416033208370209, "kl": 0.28673115372657776, "learning_rate": 8.299042297265876e-05, "loss": 0.0001, "reward": 0.8843750506639481, "reward_std": 0.22539028525352478, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19687500223517418, "step": 265 }, { "completion_length": 790.890625, "epoch": 0.5525837444819527, "grad_norm": 0.06319725513458252, "kl": 0.3224434554576874, "learning_rate": 8.286690692915386e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 266 }, { "completion_length": 837.4375, "epoch": 0.5546611269800052, "grad_norm": 0.07317644357681274, "kl": 0.3563056066632271, "learning_rate": 8.274303669726426e-05, "loss": 0.0002, "reward": 0.7906250432133675, "reward_std": 0.22539028525352478, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19687500223517418, "step": 267 }, { "completion_length": 718.125, "epoch": 0.5567385094780577, "grad_norm": 0.06230226531624794, "kl": 0.30831460282206535, "learning_rate": 8.261881361187054e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 268 }, { "completion_length": 879.25, "epoch": 0.5588158919761101, "grad_norm": 0.07465776056051254, "kl": 0.47362302988767624, "learning_rate": 8.249423901165584e-05, "loss": 0.0002, "reward": 0.8535156697034836, "reward_std": 0.22483785264194012, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19726562686264515, "step": 269 }, { "completion_length": 669.40625, "epoch": 0.5608932744741626, "grad_norm": 0.07374807447195053, "kl": 0.329727228730917, "learning_rate": 8.236931423909138e-05, "loss": 0.0002, "reward": 0.7773437947034836, "reward_std": 0.19997863098978996, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.1992187537252903, "step": 270 }, { "completion_length": 732.3125, "epoch": 0.562970656972215, "grad_norm": 0.0676698312163353, "kl": 0.3591331150382757, "learning_rate": 8.2244040640422e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 271 }, { "completion_length": 840.921875, "epoch": 0.5650480394702675, "grad_norm": 0.0682259052991867, "kl": 0.35003719478845596, "learning_rate": 8.21184195656516e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 272 }, { "completion_length": 722.375, "epoch": 0.5671254219683199, "grad_norm": 0.05751950666308403, "kl": 0.37356993556022644, "learning_rate": 8.199245236852871e-05, "loss": 0.0002, "reward": 0.6843750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 273 }, { "completion_length": 679.171875, "epoch": 0.5692028044663724, "grad_norm": 0.09351193159818649, "kl": 0.3799058124423027, "learning_rate": 8.186614040653176e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.33145629428327084, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 274 }, { "completion_length": 767.84375, "epoch": 0.5712801869644248, "grad_norm": 0.06785906106233597, "kl": 0.3164171427488327, "learning_rate": 8.173948504085454e-05, "loss": 0.0002, "reward": 0.8242187947034836, "reward_std": 0.17788154631853104, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19921875186264515, "step": 275 }, { "completion_length": 700.25, "epoch": 0.5733575694624773, "grad_norm": 0.06914710998535156, "kl": 0.3422697074711323, "learning_rate": 8.161248763639153e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 276 }, { "completion_length": 662.9375, "epoch": 0.5754349519605297, "grad_norm": 0.05800582095980644, "kl": 0.37523847445845604, "learning_rate": 8.148514956172315e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.1325825173407793, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 277 }, { "completion_length": 834.5625, "epoch": 0.5775123344585822, "grad_norm": 0.06169675290584564, "kl": 0.31875982135534286, "learning_rate": 8.135747218910104e-05, "loss": 0.0002, "reward": 0.8367187976837158, "reward_std": 0.20439805276691914, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.1960937511175871, "step": 278 }, { "completion_length": 766.546875, "epoch": 0.5795897169566346, "grad_norm": 0.08040869235992432, "kl": 1.0613461509346962, "learning_rate": 8.122945689443328e-05, "loss": 0.0005, "reward": 0.8703125417232513, "reward_std": 0.1568893175572157, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19843750074505806, "step": 279 }, { "completion_length": 737.5625, "epoch": 0.5816670994546871, "grad_norm": 0.0702415257692337, "kl": 0.34963829442858696, "learning_rate": 8.11011050572695e-05, "loss": 0.0002, "reward": 0.8222656659781933, "reward_std": 0.22483785450458527, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19726562686264515, "step": 280 }, { "completion_length": 753.703125, "epoch": 0.5837444819527395, "grad_norm": 0.07661338895559311, "kl": 0.38233664259314537, "learning_rate": 8.097241806078615e-05, "loss": 0.0002, "reward": 0.7929687909781933, "reward_std": 0.26626989245414734, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19921875186264515, "step": 281 }, { "completion_length": 880.609375, "epoch": 0.585821864450792, "grad_norm": 0.07432933151721954, "kl": 0.42199838161468506, "learning_rate": 8.084339729177142e-05, "loss": 0.0002, "reward": 0.8500000461935997, "reward_std": 0.27400387451052666, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19375000335276127, "step": 282 }, { "completion_length": 778.453125, "epoch": 0.5878992469488444, "grad_norm": 0.07835783809423447, "kl": 0.36370869539678097, "learning_rate": 8.071404414061041e-05, "loss": 0.0002, "reward": 0.8207031637430191, "reward_std": 0.2712417396251112, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19570313021540642, "step": 283 }, { "completion_length": 806.515625, "epoch": 0.5899766294468969, "grad_norm": 0.048540204763412476, "kl": 0.3912508450448513, "learning_rate": 8.058436000127014e-05, "loss": 0.0002, "reward": 0.8679687865078449, "reward_std": 0.1602038759738207, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.1960937511175871, "step": 284 }, { "completion_length": 859.765625, "epoch": 0.5920540119449493, "grad_norm": 0.06207654997706413, "kl": 0.31765272468328476, "learning_rate": 8.045434627128446e-05, "loss": 0.0002, "reward": 0.9312500506639481, "reward_std": 0.2032931987196207, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.19687500223517418, "step": 285 }, { "completion_length": 710.453125, "epoch": 0.5941313944430018, "grad_norm": 0.08810100704431534, "kl": 0.40968091040849686, "learning_rate": 8.032400435173907e-05, "loss": 0.0002, "reward": 0.8542969226837158, "reward_std": 0.31212134286761284, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19804687798023224, "step": 286 }, { "completion_length": 700.296875, "epoch": 0.5962087769410542, "grad_norm": 0.07407598942518234, "kl": 0.3017115257680416, "learning_rate": 8.019333564725639e-05, "loss": 0.0002, "reward": 0.9476562887430191, "reward_std": 0.18009125301614404, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.19765625521540642, "step": 287 }, { "completion_length": 628.984375, "epoch": 0.5982861594391067, "grad_norm": 0.05131203308701515, "kl": 0.3888060562312603, "learning_rate": 8.006234156598042e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 288 }, { "completion_length": 648.28125, "epoch": 0.6003635419371591, "grad_norm": 0.07319964468479156, "kl": 0.3936074487864971, "learning_rate": 7.99310235195615e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 289 }, { "completion_length": 788.953125, "epoch": 0.6024409244352117, "grad_norm": 0.07722538709640503, "kl": 0.35653146356344223, "learning_rate": 7.979938292314129e-05, "loss": 0.0002, "reward": 0.8386719189584255, "reward_std": 0.24583008512854576, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19804687798023224, "step": 290 }, { "completion_length": 679.46875, "epoch": 0.6045183069332641, "grad_norm": 0.03349410742521286, "kl": 0.35145866870880127, "learning_rate": 7.966742119533723e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.04419417306780815, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 291 }, { "completion_length": 761.25, "epoch": 0.6065956894313166, "grad_norm": 0.06922980397939682, "kl": 0.33772632107138634, "learning_rate": 7.953513975822755e-05, "loss": 0.0002, "reward": 0.8242187947034836, "reward_std": 0.2220757193863392, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19921875186264515, "step": 292 }, { "completion_length": 618.25, "epoch": 0.608673071929369, "grad_norm": 0.07786116003990173, "kl": 0.5136113204061985, "learning_rate": 7.940254003733578e-05, "loss": 0.0003, "reward": 0.7781250476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 293 }, { "completion_length": 704.921875, "epoch": 0.6107504544274215, "grad_norm": 0.0848776176571846, "kl": 0.4174853079020977, "learning_rate": 7.926962346161535e-05, "loss": 0.0002, "reward": 0.699218787252903, "reward_std": 0.22207571775652468, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.1992187537252903, "step": 294 }, { "completion_length": 657.734375, "epoch": 0.6128278369254739, "grad_norm": 0.0675949826836586, "kl": 0.4570797383785248, "learning_rate": 7.913639146343435e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 295 }, { "completion_length": 689.328125, "epoch": 0.6149052194235264, "grad_norm": 0.07435144484043121, "kl": 0.3593181371688843, "learning_rate": 7.900284547855991e-05, "loss": 0.0002, "reward": 0.8691406697034836, "reward_std": 0.2469349391758442, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19726562686264515, "step": 296 }, { "completion_length": 783.453125, "epoch": 0.6169826019215788, "grad_norm": 0.07517191022634506, "kl": 0.7363171242177486, "learning_rate": 7.886898694614291e-05, "loss": 0.0004, "reward": 0.8375000469386578, "reward_std": 0.20329319685697556, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19687500223517418, "step": 297 }, { "completion_length": 665.171875, "epoch": 0.6190599844196313, "grad_norm": 0.07602944225072861, "kl": 0.4283002242445946, "learning_rate": 7.873481730870232e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 298 }, { "completion_length": 741.75, "epoch": 0.6211373669176837, "grad_norm": 0.07438351958990097, "kl": 0.2955322675406933, "learning_rate": 7.860033801210976e-05, "loss": 0.0001, "reward": 0.8250000439584255, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 299 }, { "completion_length": 719.9375, "epoch": 0.6232147494157362, "grad_norm": 0.08875050395727158, "kl": 0.34281647577881813, "learning_rate": 7.84655505055738e-05, "loss": 0.0002, "reward": 0.7125000432133675, "reward_std": 0.38006988912820816, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19687500409781933, "step": 300 }, { "completion_length": 755.453125, "epoch": 0.6252921319137886, "grad_norm": 0.07758081704378128, "kl": 0.29608317092061043, "learning_rate": 7.833045624162452e-05, "loss": 0.0001, "reward": 0.7781250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 301 }, { "completion_length": 721.40625, "epoch": 0.6273695144118411, "grad_norm": 0.07114533334970474, "kl": 0.5064779743552208, "learning_rate": 7.819505667609767e-05, "loss": 0.0003, "reward": 0.7468750439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 302 }, { "completion_length": 804.59375, "epoch": 0.6294468969098935, "grad_norm": 0.06897041946649551, "kl": 0.3391858469694853, "learning_rate": 7.805935326811912e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 303 }, { "completion_length": 723.96875, "epoch": 0.631524279407946, "grad_norm": 0.07760775089263916, "kl": 0.3714125622063875, "learning_rate": 7.792334748008905e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 304 }, { "completion_length": 694.328125, "epoch": 0.6336016619059984, "grad_norm": 0.08604968339204788, "kl": 0.3233291208744049, "learning_rate": 7.77870407776662e-05, "loss": 0.0002, "reward": 0.7000000476837158, "reward_std": 0.3093592058867216, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 305 }, { "completion_length": 875.234375, "epoch": 0.6356790444040509, "grad_norm": 0.07271739840507507, "kl": 0.2942599691450596, "learning_rate": 7.765043462975217e-05, "loss": 0.0001, "reward": 0.7464844100177288, "reward_std": 0.1817485373467207, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.18398437649011612, "step": 306 }, { "completion_length": 674.65625, "epoch": 0.6377564269021033, "grad_norm": 0.06499814242124557, "kl": 0.6551753357052803, "learning_rate": 7.751353050847545e-05, "loss": 0.0003, "reward": 0.6683594062924385, "reward_std": 0.13313494622707367, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.19960937649011612, "step": 307 }, { "completion_length": 794.671875, "epoch": 0.6398338094001558, "grad_norm": 0.07812398672103882, "kl": 0.29106237180531025, "learning_rate": 7.737632988917564e-05, "loss": 0.0001, "reward": 0.8218750506639481, "reward_std": 0.3137786276638508, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19687500223517418, "step": 308 }, { "completion_length": 726.0, "epoch": 0.6419111918982082, "grad_norm": 0.08285919576883316, "kl": 0.3495354764163494, "learning_rate": 7.723883425038758e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 309 }, { "completion_length": 862.09375, "epoch": 0.6439885743962607, "grad_norm": 0.06892167776823044, "kl": 0.318182036280632, "learning_rate": 7.710104507382531e-05, "loss": 0.0002, "reward": 0.7753906697034836, "reward_std": 0.24693494103848934, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19726562686264515, "step": 310 }, { "completion_length": 705.40625, "epoch": 0.6460659568943131, "grad_norm": 0.053015708923339844, "kl": 0.30275189504027367, "learning_rate": 7.696296384436619e-05, "loss": 0.0002, "reward": 0.7781250402331352, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 311 }, { "completion_length": 689.0, "epoch": 0.6481433393923656, "grad_norm": 0.08785798400640488, "kl": 0.3134246002882719, "learning_rate": 7.682459205003483e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 312 }, { "completion_length": 875.421875, "epoch": 0.650220721890418, "grad_norm": 0.07502438127994537, "kl": 0.33200008049607277, "learning_rate": 7.668593118198719e-05, "loss": 0.0002, "reward": 0.8218750506639481, "reward_std": 0.26958445087075233, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19687500409781933, "step": 313 }, { "completion_length": 708.828125, "epoch": 0.6522981043884706, "grad_norm": 0.08758591115474701, "kl": 0.3114005923271179, "learning_rate": 7.654698273449435e-05, "loss": 0.0002, "reward": 0.9179687947034836, "reward_std": 0.31046406738460064, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.1992187537252903, "step": 314 }, { "completion_length": 843.875, "epoch": 0.654375486886523, "grad_norm": 0.06280484795570374, "kl": 0.2624143324792385, "learning_rate": 7.640774820492647e-05, "loss": 0.0001, "reward": 0.7937500439584255, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 315 }, { "completion_length": 676.296875, "epoch": 0.6564528693845755, "grad_norm": 0.06573140621185303, "kl": 0.29568540304899216, "learning_rate": 7.626822909373667e-05, "loss": 0.0001, "reward": 0.7781250402331352, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 316 }, { "completion_length": 708.625, "epoch": 0.6585302518826279, "grad_norm": 0.07694177329540253, "kl": 0.2915249727666378, "learning_rate": 7.612842690444486e-05, "loss": 0.0001, "reward": 0.9648437947034836, "reward_std": 0.2441728077828884, "rewards/argmax_reward_func": 0.765625, "rewards/format_reward_func": 0.1992187537252903, "step": 317 }, { "completion_length": 656.375, "epoch": 0.6606076343806804, "grad_norm": 0.09348881989717484, "kl": 0.3169392794370651, "learning_rate": 7.598834314362151e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.3314562924206257, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 318 }, { "completion_length": 640.75, "epoch": 0.6626850168787328, "grad_norm": 0.07280497252941132, "kl": 0.2968177553266287, "learning_rate": 7.584797932087145e-05, "loss": 0.0001, "reward": 0.8710937947034836, "reward_std": 0.19997863844037056, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19921875186264515, "step": 319 }, { "completion_length": 694.359375, "epoch": 0.6647623993767853, "grad_norm": 0.09892084449529648, "kl": 0.5367627218365669, "learning_rate": 7.570733694881755e-05, "loss": 0.0003, "reward": 0.9031250439584255, "reward_std": 0.28726212307810783, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 320 }, { "completion_length": 653.953125, "epoch": 0.6668397818748377, "grad_norm": 0.07763518393039703, "kl": 0.31165359169244766, "learning_rate": 7.556641754308447e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 321 }, { "completion_length": 738.703125, "epoch": 0.6689171643728902, "grad_norm": 0.0881708562374115, "kl": 0.3136756382882595, "learning_rate": 7.542522262228231e-05, "loss": 0.0002, "reward": 0.8085937947034836, "reward_std": 0.33256115205585957, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.1992187537252903, "step": 322 }, { "completion_length": 757.421875, "epoch": 0.6709945468709426, "grad_norm": 0.0727957934141159, "kl": 0.28189600445330143, "learning_rate": 7.528375370799024e-05, "loss": 0.0001, "reward": 0.8093750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 323 }, { "completion_length": 660.125, "epoch": 0.6730719293689951, "grad_norm": 0.068515844643116, "kl": 0.29710386879742146, "learning_rate": 7.514201232474011e-05, "loss": 0.0001, "reward": 0.8562500439584255, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 324 }, { "completion_length": 704.8125, "epoch": 0.6751493118670475, "grad_norm": 0.07097381353378296, "kl": 0.31055452302098274, "learning_rate": 7.500000000000001e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 325 }, { "completion_length": 680.921875, "epoch": 0.6772266943651, "grad_norm": 0.06986773759126663, "kl": 0.3125472627580166, "learning_rate": 7.48577182641578e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 326 }, { "completion_length": 744.046875, "epoch": 0.6793040768631524, "grad_norm": 0.06576069444417953, "kl": 0.3361051678657532, "learning_rate": 7.471516865050467e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 327 }, { "completion_length": 654.0, "epoch": 0.6813814593612049, "grad_norm": 0.07205154001712799, "kl": 0.30227479338645935, "learning_rate": 7.457235269521856e-05, "loss": 0.0002, "reward": 0.7617187909781933, "reward_std": 0.17788154468871653, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.1992187537252903, "step": 328 }, { "completion_length": 647.859375, "epoch": 0.6834588418592573, "grad_norm": 0.0794130265712738, "kl": 0.4595659039914608, "learning_rate": 7.44292719373476e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.24306794628500938, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 329 }, { "completion_length": 735.078125, "epoch": 0.6855362243573098, "grad_norm": 0.0897228941321373, "kl": 0.3426021710038185, "learning_rate": 7.428592791879361e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.33145629800856113, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 330 }, { "completion_length": 853.703125, "epoch": 0.6876136068553622, "grad_norm": 0.07609646022319794, "kl": 0.26740806736052036, "learning_rate": 7.414232218429537e-05, "loss": 0.0001, "reward": 0.9156250506639481, "reward_std": 0.26958445832133293, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.19687500223517418, "step": 331 }, { "completion_length": 616.890625, "epoch": 0.6896909893534147, "grad_norm": 0.09115231037139893, "kl": 0.334526427090168, "learning_rate": 7.399845628141206e-05, "loss": 0.0002, "reward": 0.8718750439584255, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 332 }, { "completion_length": 622.765625, "epoch": 0.6917683718514671, "grad_norm": 0.08646494895219803, "kl": 0.3055717647075653, "learning_rate": 7.385433176050653e-05, "loss": 0.0002, "reward": 0.8710937909781933, "reward_std": 0.2883669827133417, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19921875186264515, "step": 333 }, { "completion_length": 686.921875, "epoch": 0.6938457543495196, "grad_norm": 0.0787225142121315, "kl": 0.3159499131143093, "learning_rate": 7.370995017472863e-05, "loss": 0.0002, "reward": 0.8531250506639481, "reward_std": 0.26958445459604263, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19687500223517418, "step": 334 }, { "completion_length": 634.703125, "epoch": 0.695923136847572, "grad_norm": 0.09521856158971786, "kl": 0.3115619271993637, "learning_rate": 7.356531307999843e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.375650467351079, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 335 }, { "completion_length": 753.703125, "epoch": 0.6980005193456245, "grad_norm": 0.09729248285293579, "kl": 0.3037104904651642, "learning_rate": 7.342042203498951e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.3977475520223379, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 336 }, { "completion_length": 674.0, "epoch": 0.700077901843677, "grad_norm": 0.09549879282712936, "kl": 0.3098057843744755, "learning_rate": 7.32752786011121e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.37565046921372414, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 337 }, { "completion_length": 673.1875, "epoch": 0.7021552843417295, "grad_norm": 0.08708694577217102, "kl": 0.29914069548249245, "learning_rate": 7.312988434249632e-05, "loss": 0.0001, "reward": 0.9031250476837158, "reward_std": 0.33145629800856113, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 338 }, { "completion_length": 699.65625, "epoch": 0.7042326668397819, "grad_norm": 0.09121581166982651, "kl": 0.31991639360785484, "learning_rate": 7.298424082597526e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.3093592096120119, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 339 }, { "completion_length": 667.546875, "epoch": 0.7063100493378344, "grad_norm": 0.08295177668333054, "kl": 0.3119734339416027, "learning_rate": 7.283834962106811e-05, "loss": 0.0002, "reward": 0.6656250394880772, "reward_std": 0.31377863325178623, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.19687500409781933, "step": 340 }, { "completion_length": 734.140625, "epoch": 0.7083874318358868, "grad_norm": 0.07721901684999466, "kl": 0.2920740433037281, "learning_rate": 7.269221229996331e-05, "loss": 0.0001, "reward": 0.8875000476837158, "reward_std": 0.30935920774936676, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 341 }, { "completion_length": 723.78125, "epoch": 0.7104648143339393, "grad_norm": 0.07446262985467911, "kl": 0.31096627190709114, "learning_rate": 7.254583043750151e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 342 }, { "completion_length": 598.140625, "epoch": 0.7125421968319917, "grad_norm": 0.07494507730007172, "kl": 0.3281702548265457, "learning_rate": 7.239920561115867e-05, "loss": 0.0002, "reward": 0.7000000476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 343 }, { "completion_length": 669.484375, "epoch": 0.7146195793300442, "grad_norm": 0.06954500079154968, "kl": 0.29709911718964577, "learning_rate": 7.225233940102906e-05, "loss": 0.0001, "reward": 0.9343750476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 344 }, { "completion_length": 738.015625, "epoch": 0.7166969618280966, "grad_norm": 0.08409620076417923, "kl": 0.3333327900618315, "learning_rate": 7.210523338980813e-05, "loss": 0.0002, "reward": 0.8398437947034836, "reward_std": 0.2883669827133417, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.1992187537252903, "step": 345 }, { "completion_length": 596.5, "epoch": 0.7187743443261491, "grad_norm": 0.07967247068881989, "kl": 0.3089658170938492, "learning_rate": 7.195788916277565e-05, "loss": 0.0002, "reward": 0.7929687947034836, "reward_std": 0.22207571775652468, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19921875186264515, "step": 346 }, { "completion_length": 717.390625, "epoch": 0.7208517268242015, "grad_norm": 0.0864432230591774, "kl": 0.3028757870197296, "learning_rate": 7.181030830777837e-05, "loss": 0.0002, "reward": 0.8843750506639481, "reward_std": 0.2695844564586878, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19687500223517418, "step": 347 }, { "completion_length": 716.515625, "epoch": 0.722929109322254, "grad_norm": 0.07595375925302505, "kl": 0.30636318400502205, "learning_rate": 7.166249241521318e-05, "loss": 0.0002, "reward": 0.7898437976837158, "reward_std": 0.22649514116346836, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19609375298023224, "step": 348 }, { "completion_length": 584.3125, "epoch": 0.7250064918203064, "grad_norm": 0.10229937732219696, "kl": 0.32858528569340706, "learning_rate": 7.151444307800975e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.3756504710763693, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 349 }, { "completion_length": 590.1875, "epoch": 0.7270838743183589, "grad_norm": 0.07948501408100128, "kl": 0.3115417957305908, "learning_rate": 7.13661618916135e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.2651650346815586, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 350 }, { "completion_length": 618.46875, "epoch": 0.7291612568164113, "grad_norm": 0.06686828285455704, "kl": 0.32769910246133804, "learning_rate": 7.121765045396834e-05, "loss": 0.0002, "reward": 0.8867187947034836, "reward_std": 0.17788154655136168, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.1992187537252903, "step": 351 }, { "completion_length": 600.5625, "epoch": 0.7312386393144638, "grad_norm": 0.07947742938995361, "kl": 0.3473210446536541, "learning_rate": 7.106891036549945e-05, "loss": 0.0002, "reward": 0.7937500439584255, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 352 }, { "completion_length": 551.78125, "epoch": 0.7333160218125162, "grad_norm": 0.04208023473620415, "kl": 0.35688477009534836, "learning_rate": 7.091994322909611e-05, "loss": 0.0002, "reward": 0.9968750476837158, "reward_std": 0.06629125960171223, "rewards/argmax_reward_func": 0.796875, "rewards/format_reward_func": 0.20000000298023224, "step": 353 }, { "completion_length": 576.609375, "epoch": 0.7353934043105687, "grad_norm": 0.06657633185386658, "kl": 0.32345687225461006, "learning_rate": 7.077075065009433e-05, "loss": 0.0002, "reward": 0.7156250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 354 }, { "completion_length": 552.921875, "epoch": 0.7374707868086211, "grad_norm": 0.0544576533138752, "kl": 0.3251136727631092, "learning_rate": 7.062133423625959e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 355 }, { "completion_length": 606.703125, "epoch": 0.7395481693066736, "grad_norm": 0.07147221267223358, "kl": 0.3404123783111572, "learning_rate": 7.04716955977695e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 356 }, { "completion_length": 605.28125, "epoch": 0.741625551804726, "grad_norm": 0.05363324284553528, "kl": 0.3204925172030926, "learning_rate": 7.03218363471965e-05, "loss": 0.0002, "reward": 0.9500000439584255, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 357 }, { "completion_length": 563.65625, "epoch": 0.7437029343027785, "grad_norm": 0.041013430804014206, "kl": 0.3419278897345066, "learning_rate": 7.017175809949044e-05, "loss": 0.0002, "reward": 0.8562500439584255, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 358 }, { "completion_length": 557.15625, "epoch": 0.7457803168008309, "grad_norm": 0.06874032318592072, "kl": 0.3553139455616474, "learning_rate": 7.002146247196113e-05, "loss": 0.0002, "reward": 0.776562537997961, "reward_std": 0.1568893138319254, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.1984375026077032, "step": 359 }, { "completion_length": 567.15625, "epoch": 0.7478576992988835, "grad_norm": 0.0703793615102768, "kl": 0.33949872851371765, "learning_rate": 6.987095108426101e-05, "loss": 0.0002, "reward": 0.7312500402331352, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 360 }, { "completion_length": 552.984375, "epoch": 0.7499350817969359, "grad_norm": 0.06514879316091537, "kl": 0.3468449302017689, "learning_rate": 6.972022555836764e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 361 }, { "completion_length": 540.453125, "epoch": 0.7520124642949884, "grad_norm": 0.08254203200340271, "kl": 0.36483363062143326, "learning_rate": 6.956928751856623e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 362 }, { "completion_length": 553.078125, "epoch": 0.7540898467930408, "grad_norm": 0.08209247887134552, "kl": 0.37783167138695717, "learning_rate": 6.94181385914321e-05, "loss": 0.0002, "reward": 0.667187537997961, "reward_std": 0.17898640409111977, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.1984375026077032, "step": 363 }, { "completion_length": 610.234375, "epoch": 0.7561672292910933, "grad_norm": 0.062447499483823776, "kl": 0.34449223801493645, "learning_rate": 6.926678040581323e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 364 }, { "completion_length": 527.78125, "epoch": 0.7582446117891457, "grad_norm": 0.07009898126125336, "kl": 0.36786164715886116, "learning_rate": 6.911521459281265e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 365 }, { "completion_length": 587.296875, "epoch": 0.7603219942871982, "grad_norm": 0.06519950181245804, "kl": 0.35170425847172737, "learning_rate": 6.896344278577083e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 366 }, { "completion_length": 583.96875, "epoch": 0.7623993767852506, "grad_norm": 0.051437534391880035, "kl": 0.34650370851159096, "learning_rate": 6.881146662024822e-05, "loss": 0.0002, "reward": 1.0593750476837158, "reward_std": 0.11048543080687523, "rewards/argmax_reward_func": 0.859375, "rewards/format_reward_func": 0.20000000298023224, "step": 367 }, { "completion_length": 538.484375, "epoch": 0.764476759283303, "grad_norm": 0.06923159956932068, "kl": 0.38169170916080475, "learning_rate": 6.865928773400743e-05, "loss": 0.0002, "reward": 0.8250000439584255, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 368 }, { "completion_length": 580.03125, "epoch": 0.7665541417813555, "grad_norm": 0.0665920302271843, "kl": 0.3636031821370125, "learning_rate": 6.850690776699573e-05, "loss": 0.0002, "reward": 0.7289062924683094, "reward_std": 0.13589708344079554, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19765625149011612, "step": 369 }, { "completion_length": 562.015625, "epoch": 0.768631524279408, "grad_norm": 0.06946459412574768, "kl": 0.4947234131395817, "learning_rate": 6.835432836132731e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 370 }, { "completion_length": 575.0625, "epoch": 0.7707089067774604, "grad_norm": 0.0689174011349678, "kl": 0.3747940734028816, "learning_rate": 6.820155116126561e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 371 }, { "completion_length": 571.109375, "epoch": 0.7727862892755128, "grad_norm": 0.08710569888353348, "kl": 0.39623570069670677, "learning_rate": 6.804857781320558e-05, "loss": 0.0002, "reward": 0.7464844174683094, "reward_std": 0.28670969791710377, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.19960937835276127, "step": 372 }, { "completion_length": 607.59375, "epoch": 0.7748636717735653, "grad_norm": 0.0731528028845787, "kl": 0.3582250289618969, "learning_rate": 6.789540996565593e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 373 }, { "completion_length": 579.28125, "epoch": 0.7769410542716177, "grad_norm": 0.0625411793589592, "kl": 0.3635551296174526, "learning_rate": 6.774204926922145e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 374 }, { "completion_length": 599.3125, "epoch": 0.7790184367696702, "grad_norm": 0.08092815428972244, "kl": 0.4265919253230095, "learning_rate": 6.758849737658509e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 375 }, { "completion_length": 547.796875, "epoch": 0.7810958192677226, "grad_norm": 0.07175435870885849, "kl": 0.3616880625486374, "learning_rate": 6.743475594249021e-05, "loss": 0.0002, "reward": 0.8843750432133675, "reward_std": 0.18119611218571663, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.19687500223517418, "step": 376 }, { "completion_length": 591.609375, "epoch": 0.7831732017657751, "grad_norm": 0.07784335315227509, "kl": 0.42067378014326096, "learning_rate": 6.728082662372282e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 377 }, { "completion_length": 531.859375, "epoch": 0.7852505842638275, "grad_norm": 0.07652134448289871, "kl": 0.3934118077158928, "learning_rate": 6.712671107909359e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 378 }, { "completion_length": 542.640625, "epoch": 0.78732796676188, "grad_norm": 0.054136764258146286, "kl": 0.4239979311823845, "learning_rate": 6.697241096942006e-05, "loss": 0.0002, "reward": 0.8562500439584255, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 379 }, { "completion_length": 548.53125, "epoch": 0.7894053492599324, "grad_norm": 0.07836976647377014, "kl": 0.4376937076449394, "learning_rate": 6.681792795750875e-05, "loss": 0.0002, "reward": 0.7308594211935997, "reward_std": 0.17732911929488182, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19960937649011612, "step": 380 }, { "completion_length": 545.140625, "epoch": 0.7914827317579849, "grad_norm": 0.06161171570420265, "kl": 0.526831716299057, "learning_rate": 6.666326370813723e-05, "loss": 0.0003, "reward": 0.8562500476837158, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 381 }, { "completion_length": 539.4375, "epoch": 0.7935601142560373, "grad_norm": 0.050724372267723083, "kl": 0.4309442602097988, "learning_rate": 6.650841988803606e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 382 }, { "completion_length": 515.109375, "epoch": 0.7956374967540899, "grad_norm": 0.08242635428905487, "kl": 0.4333142638206482, "learning_rate": 6.635339816587109e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 383 }, { "completion_length": 582.0, "epoch": 0.7977148792521424, "grad_norm": 0.07576624304056168, "kl": 0.40080199763178825, "learning_rate": 6.619820021222518e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 384 }, { "completion_length": 563.59375, "epoch": 0.7997922617501948, "grad_norm": 0.08377435803413391, "kl": 0.43326959386467934, "learning_rate": 6.604282769958044e-05, "loss": 0.0002, "reward": 0.8089844211935997, "reward_std": 0.2436203770339489, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.19960937649011612, "step": 385 }, { "completion_length": 548.859375, "epoch": 0.8018696442482472, "grad_norm": 0.09505198895931244, "kl": 0.625109825283289, "learning_rate": 6.588728230230004e-05, "loss": 0.0003, "reward": 0.7933594211935997, "reward_std": 0.3088067825883627, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19960937649011612, "step": 386 }, { "completion_length": 534.046875, "epoch": 0.8039470267462997, "grad_norm": 0.09738834947347641, "kl": 0.5551509782671928, "learning_rate": 6.573156569661025e-05, "loss": 0.0003, "reward": 0.8703125491738319, "reward_std": 0.289471834897995, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.1984375026077032, "step": 387 }, { "completion_length": 557.40625, "epoch": 0.8060244092443521, "grad_norm": 0.0654783695936203, "kl": 0.42286501079797745, "learning_rate": 6.557567956058239e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 388 }, { "completion_length": 509.921875, "epoch": 0.8081017917424046, "grad_norm": 0.07516364008188248, "kl": 0.5226034559309483, "learning_rate": 6.541962557411469e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 389 }, { "completion_length": 540.953125, "epoch": 0.810179174240457, "grad_norm": 0.08237718045711517, "kl": 0.49187011271715164, "learning_rate": 6.526340541891418e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 390 }, { "completion_length": 538.03125, "epoch": 0.8122565567385095, "grad_norm": 0.08174508810043335, "kl": 0.45481956005096436, "learning_rate": 6.510702077847863e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 391 }, { "completion_length": 552.703125, "epoch": 0.8143339392365619, "grad_norm": 0.09640171378850937, "kl": 0.45114999637007713, "learning_rate": 6.495047333807842e-05, "loss": 0.0002, "reward": 0.7621094211935997, "reward_std": 0.309911634773016, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.19960937835276127, "step": 392 }, { "completion_length": 555.75, "epoch": 0.8164113217346144, "grad_norm": 0.07019418478012085, "kl": 0.44265756756067276, "learning_rate": 6.479376478473823e-05, "loss": 0.0002, "reward": 0.9500000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 393 }, { "completion_length": 588.09375, "epoch": 0.8184887042326668, "grad_norm": 0.05258520692586899, "kl": 0.4588502533733845, "learning_rate": 6.463689680721904e-05, "loss": 0.0002, "reward": 0.8718750439584255, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 394 }, { "completion_length": 527.578125, "epoch": 0.8205660867307193, "grad_norm": 0.08728921413421631, "kl": 0.44911035895347595, "learning_rate": 6.447987109599986e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 395 }, { "completion_length": 635.40625, "epoch": 0.8226434692287717, "grad_norm": 0.06634779274463654, "kl": 0.38350560516119003, "learning_rate": 6.432268934325946e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 396 }, { "completion_length": 532.09375, "epoch": 0.8247208517268242, "grad_norm": 0.09231170266866684, "kl": 0.46880777925252914, "learning_rate": 6.416535324285824e-05, "loss": 0.0002, "reward": 0.6843750402331352, "reward_std": 0.2872621212154627, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 397 }, { "completion_length": 588.875, "epoch": 0.8267982342248766, "grad_norm": 0.07496833801269531, "kl": 0.850627463310957, "learning_rate": 6.400786449031986e-05, "loss": 0.0004, "reward": 0.8875000476837158, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 398 }, { "completion_length": 511.953125, "epoch": 0.8288756167229291, "grad_norm": 0.06271515041589737, "kl": 0.4049219489097595, "learning_rate": 6.385022478281306e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 399 }, { "completion_length": 555.046875, "epoch": 0.8309529992209815, "grad_norm": 0.07291208207607269, "kl": 0.4224717430770397, "learning_rate": 6.369243581913336e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 400 }, { "completion_length": 573.515625, "epoch": 0.833030381719034, "grad_norm": 0.06133547052741051, "kl": 0.42930199950933456, "learning_rate": 6.353449929968465e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 401 }, { "completion_length": 522.8125, "epoch": 0.8351077642170864, "grad_norm": 0.06863158941268921, "kl": 0.4221891984343529, "learning_rate": 6.337641692646106e-05, "loss": 0.0002, "reward": 0.9019531756639481, "reward_std": 0.15633688867092133, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.19882812723517418, "step": 402 }, { "completion_length": 544.0, "epoch": 0.8371851467151389, "grad_norm": 0.07222079485654831, "kl": 0.45428359508514404, "learning_rate": 6.321819040302839e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 403 }, { "completion_length": 527.671875, "epoch": 0.8392625292131913, "grad_norm": 0.08342251926660538, "kl": 0.4082505330443382, "learning_rate": 6.305982143450597e-05, "loss": 0.0002, "reward": 0.8402344286441803, "reward_std": 0.24362037930404767, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19960937649011612, "step": 404 }, { "completion_length": 544.171875, "epoch": 0.8413399117112438, "grad_norm": 0.064121775329113, "kl": 0.43093303963541985, "learning_rate": 6.290131172754811e-05, "loss": 0.0002, "reward": 0.9949219226837158, "reward_std": 0.15744174271821976, "rewards/argmax_reward_func": 0.796875, "rewards/format_reward_func": 0.19804687798023224, "step": 405 }, { "completion_length": 517.578125, "epoch": 0.8434172942092963, "grad_norm": 0.08640465885400772, "kl": 0.44819287210702896, "learning_rate": 6.274266299032582e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 406 }, { "completion_length": 536.734375, "epoch": 0.8454946767073488, "grad_norm": 0.09182075411081314, "kl": 0.39375099167227745, "learning_rate": 6.25838769325083e-05, "loss": 0.0002, "reward": 0.7625000476837158, "reward_std": 0.3093592096120119, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 407 }, { "completion_length": 537.265625, "epoch": 0.8475720592054012, "grad_norm": 0.05770527943968773, "kl": 0.48194558918476105, "learning_rate": 6.24249552652447e-05, "loss": 0.0002, "reward": 0.7292969226837158, "reward_std": 0.13534465618431568, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.19804687798023224, "step": 408 }, { "completion_length": 536.84375, "epoch": 0.8496494417034537, "grad_norm": 0.07029449939727783, "kl": 0.4273468554019928, "learning_rate": 6.226589970114543e-05, "loss": 0.0002, "reward": 0.8406250439584255, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 409 }, { "completion_length": 539.140625, "epoch": 0.8517268242015061, "grad_norm": 0.0789664089679718, "kl": 0.4228878915309906, "learning_rate": 6.210671195426387e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 410 }, { "completion_length": 582.265625, "epoch": 0.8538042066995586, "grad_norm": 0.05472075939178467, "kl": 0.39347052946686745, "learning_rate": 6.194739374007792e-05, "loss": 0.0002, "reward": 0.8562500439584255, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 411 }, { "completion_length": 511.125, "epoch": 0.855881589197611, "grad_norm": 0.08022020757198334, "kl": 0.44926824048161507, "learning_rate": 6.178794677547137e-05, "loss": 0.0002, "reward": 0.7312500439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 412 }, { "completion_length": 502.40625, "epoch": 0.8579589716956635, "grad_norm": 0.08022835850715637, "kl": 0.4819503165781498, "learning_rate": 6.162837277871553e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 413 }, { "completion_length": 537.1875, "epoch": 0.8600363541937159, "grad_norm": 0.06297382712364197, "kl": 0.49380555003881454, "learning_rate": 6.146867346945066e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 414 }, { "completion_length": 521.859375, "epoch": 0.8621137366917684, "grad_norm": 0.07238580286502838, "kl": 0.4831845983862877, "learning_rate": 6.130885056866742e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 415 }, { "completion_length": 538.84375, "epoch": 0.8641911191898208, "grad_norm": 0.070571668446064, "kl": 0.5007887817919254, "learning_rate": 6.114890579868837e-05, "loss": 0.0003, "reward": 0.8250000439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 416 }, { "completion_length": 573.25, "epoch": 0.8662685016878733, "grad_norm": 0.0768335610628128, "kl": 0.4633421525359154, "learning_rate": 6.098884088314938e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 417 }, { "completion_length": 627.6875, "epoch": 0.8683458841859257, "grad_norm": 0.07244177162647247, "kl": 0.45967796072363853, "learning_rate": 6.082865754698109e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 418 }, { "completion_length": 558.171875, "epoch": 0.8704232666839782, "grad_norm": 0.07271222770214081, "kl": 0.46447786316275597, "learning_rate": 6.066835751639022e-05, "loss": 0.0002, "reward": 0.7925781682133675, "reward_std": 0.22152329608798027, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.19882812909781933, "step": 419 }, { "completion_length": 539.90625, "epoch": 0.8725006491820306, "grad_norm": 0.05372535437345505, "kl": 0.47306570410728455, "learning_rate": 6.050794251884112e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 420 }, { "completion_length": 579.59375, "epoch": 0.8745780316800831, "grad_norm": 0.06956978142261505, "kl": 0.4903941936790943, "learning_rate": 6.0347414283037004e-05, "loss": 0.0002, "reward": 0.7937500476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 421 }, { "completion_length": 543.5625, "epoch": 0.8766554141781355, "grad_norm": 0.05839576572179794, "kl": 0.46378039941191673, "learning_rate": 6.018677453890149e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 422 }, { "completion_length": 571.265625, "epoch": 0.878732796676188, "grad_norm": 0.08274129778146744, "kl": 0.4939221628010273, "learning_rate": 6.002602501755974e-05, "loss": 0.0002, "reward": 0.9656250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.765625, "rewards/format_reward_func": 0.20000000298023224, "step": 423 }, { "completion_length": 562.875, "epoch": 0.8808101791742404, "grad_norm": 0.052250444889068604, "kl": 0.4885864891111851, "learning_rate": 5.9865167451320005e-05, "loss": 0.0002, "reward": 0.9031250439584255, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 424 }, { "completion_length": 564.171875, "epoch": 0.8828875616722929, "grad_norm": 0.07596340775489807, "kl": 0.4957350380718708, "learning_rate": 5.970420357365486e-05, "loss": 0.0002, "reward": 0.6843750439584255, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.484375, "rewards/format_reward_func": 0.20000000298023224, "step": 425 }, { "completion_length": 563.375, "epoch": 0.8849649441703453, "grad_norm": 0.0591680072247982, "kl": 0.47624582052230835, "learning_rate": 5.9543135119182514e-05, "loss": 0.0002, "reward": 0.7757812812924385, "reward_std": 0.11269513890147209, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19765625335276127, "step": 426 }, { "completion_length": 607.78125, "epoch": 0.8870423266683978, "grad_norm": 0.0683642029762268, "kl": 0.5224468521773815, "learning_rate": 5.938196382364818e-05, "loss": 0.0003, "reward": 0.8718750439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 427 }, { "completion_length": 584.8125, "epoch": 0.8891197091664502, "grad_norm": 0.05711337924003601, "kl": 0.4908281937241554, "learning_rate": 5.9220691423905305e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.1325825173407793, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 428 }, { "completion_length": 626.34375, "epoch": 0.8911970916645027, "grad_norm": 0.060382284224033356, "kl": 0.586872935295105, "learning_rate": 5.9059319657896884e-05, "loss": 0.0003, "reward": 0.9343750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 429 }, { "completion_length": 597.703125, "epoch": 0.8932744741625552, "grad_norm": 0.07129113376140594, "kl": 0.6612692400813103, "learning_rate": 5.889785026463672e-05, "loss": 0.0003, "reward": 0.8554687947034836, "reward_std": 0.17788155190646648, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19921875186264515, "step": 430 }, { "completion_length": 568.046875, "epoch": 0.8953518566606077, "grad_norm": 0.07546839118003845, "kl": 0.5523902028799057, "learning_rate": 5.873628498419073e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 431 }, { "completion_length": 573.609375, "epoch": 0.8974292391586601, "grad_norm": 0.06730344146490097, "kl": 0.5382697433233261, "learning_rate": 5.8574625557658095e-05, "loss": 0.0003, "reward": 0.7312500402331352, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.53125, "rewards/format_reward_func": 0.20000000298023224, "step": 432 }, { "completion_length": 576.84375, "epoch": 0.8995066216567126, "grad_norm": 0.04371188208460808, "kl": 0.49328725039958954, "learning_rate": 5.8412873727152595e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.0883883461356163, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 433 }, { "completion_length": 634.609375, "epoch": 0.901584004154765, "grad_norm": 0.05455589294433594, "kl": 0.4651510939002037, "learning_rate": 5.825103123578379e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 434 }, { "completion_length": 607.640625, "epoch": 0.9036613866528175, "grad_norm": 0.06638182699680328, "kl": 0.4994208887219429, "learning_rate": 5.808909982763825e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 435 }, { "completion_length": 577.171875, "epoch": 0.9057387691508699, "grad_norm": 0.05038674548268318, "kl": 0.4841819517314434, "learning_rate": 5.792708124776072e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 436 }, { "completion_length": 618.765625, "epoch": 0.9078161516489224, "grad_norm": 0.06103122606873512, "kl": 0.45625371113419533, "learning_rate": 5.776497724213536e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 437 }, { "completion_length": 577.34375, "epoch": 0.9098935341469748, "grad_norm": 0.06659764796495438, "kl": 0.5014519467949867, "learning_rate": 5.760278955766695e-05, "loss": 0.0003, "reward": 0.7937500439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.59375, "rewards/format_reward_func": 0.20000000298023224, "step": 438 }, { "completion_length": 588.828125, "epoch": 0.9119709166450273, "grad_norm": 0.06549356877803802, "kl": 0.4932373948395252, "learning_rate": 5.744051994216201e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 439 }, { "completion_length": 587.828125, "epoch": 0.9140482991430797, "grad_norm": 0.08965161442756653, "kl": 0.4944054037332535, "learning_rate": 5.727817014430992e-05, "loss": 0.0002, "reward": 0.7777344174683094, "reward_std": 0.28781455382704735, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19960937649011612, "step": 440 }, { "completion_length": 580.125, "epoch": 0.9161256816411322, "grad_norm": 0.07723158597946167, "kl": 0.4910140074789524, "learning_rate": 5.7115741913664264e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.24306795187294483, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 441 }, { "completion_length": 598.890625, "epoch": 0.9182030641391846, "grad_norm": 0.068883016705513, "kl": 0.481427326798439, "learning_rate": 5.695323700062375e-05, "loss": 0.0002, "reward": 0.7000000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.5, "rewards/format_reward_func": 0.20000000298023224, "step": 442 }, { "completion_length": 600.140625, "epoch": 0.9202804466372371, "grad_norm": 0.07069353759288788, "kl": 0.5068237520754337, "learning_rate": 5.6790657156413504e-05, "loss": 0.0003, "reward": 0.714843787252903, "reward_std": 0.1999786365777254, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.19921875186264515, "step": 443 }, { "completion_length": 587.625, "epoch": 0.9223578291352895, "grad_norm": 1.0064716339111328, "kl": 9.217760100960732, "learning_rate": 5.66280041330661e-05, "loss": 0.0046, "reward": 0.7781250439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 444 }, { "completion_length": 590.78125, "epoch": 0.924435211633342, "grad_norm": 0.2128666639328003, "kl": 3.3808604292571545, "learning_rate": 5.646527968340278e-05, "loss": 0.0017, "reward": 0.7625000476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 445 }, { "completion_length": 633.46875, "epoch": 0.9265125941313944, "grad_norm": 0.05456709861755371, "kl": 0.49203020706772804, "learning_rate": 5.6302485561014475e-05, "loss": 0.0002, "reward": 0.9500000439584255, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 446 }, { "completion_length": 584.015625, "epoch": 0.9285899766294469, "grad_norm": 0.06649811565876007, "kl": 0.47326431795954704, "learning_rate": 5.613962352024292e-05, "loss": 0.0002, "reward": 0.7625000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 447 }, { "completion_length": 657.1875, "epoch": 0.9306673591274993, "grad_norm": 0.08579502999782562, "kl": 0.4674902521073818, "learning_rate": 5.597669531616181e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.331456296145916, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 448 }, { "completion_length": 664.71875, "epoch": 0.9327447416255518, "grad_norm": 0.056638821959495544, "kl": 0.470287274569273, "learning_rate": 5.5813702704557814e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 449 }, { "completion_length": 618.296875, "epoch": 0.9348221241236042, "grad_norm": 0.040137626230716705, "kl": 0.46139009296894073, "learning_rate": 5.5650647441911706e-05, "loss": 0.0002, "reward": 0.9656250476837158, "reward_std": 0.06629125960171223, "rewards/argmax_reward_func": 0.765625, "rewards/format_reward_func": 0.20000000298023224, "step": 450 }, { "completion_length": 632.96875, "epoch": 0.9368995066216567, "grad_norm": 0.06284568458795547, "kl": 0.517881490290165, "learning_rate": 5.548753128537939e-05, "loss": 0.0003, "reward": 0.8718750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 451 }, { "completion_length": 677.46875, "epoch": 0.9389768891197091, "grad_norm": 0.07211080193519592, "kl": 0.46745334565639496, "learning_rate": 5.532435599277303e-05, "loss": 0.0002, "reward": 0.7781250439584255, "reward_std": 0.24306794814765453, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 452 }, { "completion_length": 697.96875, "epoch": 0.9410542716177617, "grad_norm": 0.06623782962560654, "kl": 0.4283139891922474, "learning_rate": 5.516112332254203e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.2209708634763956, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 453 }, { "completion_length": 619.375, "epoch": 0.9431316541158141, "grad_norm": 0.08626007288694382, "kl": 0.5277771130204201, "learning_rate": 5.499783503375412e-05, "loss": 0.0003, "reward": 0.8250000476837158, "reward_std": 0.30935920774936676, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 454 }, { "completion_length": 674.0, "epoch": 0.9452090366138666, "grad_norm": 0.06324354559183121, "kl": 0.4568277336657047, "learning_rate": 5.4834492886076446e-05, "loss": 0.0002, "reward": 0.8714844174683094, "reward_std": 0.19942620425717905, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.19960937835276127, "step": 455 }, { "completion_length": 701.0625, "epoch": 0.947286419111919, "grad_norm": 0.07262270897626877, "kl": 0.4528024010360241, "learning_rate": 5.4671098639756504e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 456 }, { "completion_length": 693.90625, "epoch": 0.9493638016099715, "grad_norm": 0.06315562129020691, "kl": 0.43744752556085587, "learning_rate": 5.4507654055603275e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 457 }, { "completion_length": 664.984375, "epoch": 0.9514411841080239, "grad_norm": 0.06270638853311539, "kl": 0.4934372082352638, "learning_rate": 5.4344160894968145e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 458 }, { "completion_length": 679.78125, "epoch": 0.9535185666060764, "grad_norm": 0.06320095807313919, "kl": 0.4636671505868435, "learning_rate": 5.418062091972604e-05, "loss": 0.0002, "reward": 0.9019531756639481, "reward_std": 0.15633688890375197, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.19882812537252903, "step": 459 }, { "completion_length": 768.765625, "epoch": 0.9555959491041288, "grad_norm": 0.08169972896575928, "kl": 0.4431908018887043, "learning_rate": 5.4017035892256365e-05, "loss": 0.0002, "reward": 0.7773437909781933, "reward_std": 0.33256115578114986, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.19921875186264515, "step": 460 }, { "completion_length": 649.203125, "epoch": 0.9576733316021813, "grad_norm": 0.06060326099395752, "kl": 0.4794473238289356, "learning_rate": 5.385340757542402e-05, "loss": 0.0002, "reward": 0.6375000365078449, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.4375, "rewards/format_reward_func": 0.20000000298023224, "step": 461 }, { "completion_length": 752.859375, "epoch": 0.9597507141002337, "grad_norm": 0.07235154509544373, "kl": 0.43572117015719414, "learning_rate": 5.36897377325604e-05, "loss": 0.0002, "reward": 0.9328125417232513, "reward_std": 0.2452776599675417, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.1984375026077032, "step": 462 }, { "completion_length": 680.78125, "epoch": 0.9618280965982862, "grad_norm": 0.07361527532339096, "kl": 0.45767712593078613, "learning_rate": 5.352602812744441e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.26516503654420376, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 463 }, { "completion_length": 702.609375, "epoch": 0.9639054790963386, "grad_norm": 0.0740986093878746, "kl": 0.493575606495142, "learning_rate": 5.336228052428348e-05, "loss": 0.0002, "reward": 0.9031250476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.703125, "rewards/format_reward_func": 0.20000000298023224, "step": 464 }, { "completion_length": 733.359375, "epoch": 0.9659828615943911, "grad_norm": 0.07798778265714645, "kl": 0.45794639363884926, "learning_rate": 5.319849668769449e-05, "loss": 0.0002, "reward": 0.6675781644880772, "reward_std": 0.2668223176151514, "rewards/argmax_reward_func": 0.46875, "rewards/format_reward_func": 0.19882812723517418, "step": 465 }, { "completion_length": 673.03125, "epoch": 0.9680602440924435, "grad_norm": 0.06236180663108826, "kl": 0.4699827618896961, "learning_rate": 5.303467838268478e-05, "loss": 0.0002, "reward": 0.8718750439584255, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 466 }, { "completion_length": 695.03125, "epoch": 0.970137626590496, "grad_norm": 0.06047491356730461, "kl": 0.42734822258353233, "learning_rate": 5.287082737463317e-05, "loss": 0.0002, "reward": 0.8875000439584255, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 467 }, { "completion_length": 700.515625, "epoch": 0.9722150090885484, "grad_norm": 0.05264519900083542, "kl": 0.514576718211174, "learning_rate": 5.270694542927088e-05, "loss": 0.0003, "reward": 0.9500000476837158, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 468 }, { "completion_length": 685.015625, "epoch": 0.9742923915866009, "grad_norm": 0.07000822573900223, "kl": 0.47352610528469086, "learning_rate": 5.254303431266254e-05, "loss": 0.0002, "reward": 0.8382812961935997, "reward_std": 0.24638251960277557, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.19765625149011612, "step": 469 }, { "completion_length": 778.0, "epoch": 0.9763697740846533, "grad_norm": 0.0691061019897461, "kl": 0.44779016450047493, "learning_rate": 5.2379095791187124e-05, "loss": 0.0002, "reward": 0.8238281644880772, "reward_std": 0.2226281464099884, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.19882812723517418, "step": 470 }, { "completion_length": 721.953125, "epoch": 0.9784471565827058, "grad_norm": 0.056446801871061325, "kl": 0.4972013346850872, "learning_rate": 5.2215131631518945e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 471 }, { "completion_length": 734.296875, "epoch": 0.9805245390807582, "grad_norm": 0.04742085933685303, "kl": 0.4290156289935112, "learning_rate": 5.20511436006086e-05, "loss": 0.0002, "reward": 0.9187500439584255, "reward_std": 0.13258251920342445, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 472 }, { "completion_length": 699.078125, "epoch": 0.9826019215788107, "grad_norm": 0.06520857661962509, "kl": 0.44061052426695824, "learning_rate": 5.188713346566393e-05, "loss": 0.0002, "reward": 0.9187500476837158, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.20000000298023224, "step": 473 }, { "completion_length": 816.515625, "epoch": 0.9846793040768631, "grad_norm": 0.06763774901628494, "kl": 0.462362315505743, "learning_rate": 5.172310299413099e-05, "loss": 0.0002, "reward": 0.8875000476837158, "reward_std": 0.2651650384068489, "rewards/argmax_reward_func": 0.6875, "rewards/format_reward_func": 0.20000000298023224, "step": 474 }, { "completion_length": 703.703125, "epoch": 0.9867566865749156, "grad_norm": 0.06497927010059357, "kl": 0.4288316182792187, "learning_rate": 5.1559053953674975e-05, "loss": 0.0002, "reward": 0.8562500476837158, "reward_std": 0.1767766922712326, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.20000000298023224, "step": 475 }, { "completion_length": 734.265625, "epoch": 0.9888340690729681, "grad_norm": 0.061615679413080215, "kl": 0.4229474924504757, "learning_rate": 5.139498811216122e-05, "loss": 0.0002, "reward": 0.7156250383704901, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.515625, "rewards/format_reward_func": 0.20000000298023224, "step": 476 }, { "completion_length": 745.546875, "epoch": 0.9909114515710206, "grad_norm": 0.061035335063934326, "kl": 0.44998469576239586, "learning_rate": 5.123090723763606e-05, "loss": 0.0002, "reward": 0.7781250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.578125, "rewards/format_reward_func": 0.20000000298023224, "step": 477 }, { "completion_length": 726.9375, "epoch": 0.992988834069073, "grad_norm": 0.05501917377114296, "kl": 0.4711364693939686, "learning_rate": 5.106681309830791e-05, "loss": 0.0002, "reward": 0.9312500469386578, "reward_std": 0.1590990237891674, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.19687500223517418, "step": 478 }, { "completion_length": 731.265625, "epoch": 0.9950662165671255, "grad_norm": 0.06437938660383224, "kl": 0.488413542509079, "learning_rate": 5.090270746252802e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 479 }, { "completion_length": 877.296875, "epoch": 0.9971435990651779, "grad_norm": 0.05059191957116127, "kl": 0.3898215554654598, "learning_rate": 5.073859209877168e-05, "loss": 0.0002, "reward": 0.9179687947034836, "reward_std": 0.22207572311162949, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.19921875186264515, "step": 480 }, { "completion_length": 737.34375, "epoch": 0.9992209815632304, "grad_norm": 0.05600970238447189, "kl": 0.4228545166552067, "learning_rate": 5.057446877561884e-05, "loss": 0.0002, "reward": 0.9179687909781933, "reward_std": 0.17788155004382133, "rewards/argmax_reward_func": 0.71875, "rewards/format_reward_func": 0.1992187537252903, "step": 481 }, { "completion_length": 867.2916666666666, "epoch": 1.0, "grad_norm": 0.0311344675719738, "kl": 0.4165251553058624, "learning_rate": 5.0410339261735384e-05, "loss": 0.0001, "reward": 0.9500000476837158, "reward_std": 0.23570225636164346, "rewards/argmax_reward_func": 0.75, "rewards/format_reward_func": 0.20000000298023224, "step": 482 }, { "completion_length": 721.59375, "epoch": 1.0020773824980524, "grad_norm": 0.08058605343103409, "kl": 0.4754480682313442, "learning_rate": 5.0246205325853826e-05, "loss": 0.0002, "reward": 0.8250000476837158, "reward_std": 0.30935921147465706, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 483 }, { "completion_length": 749.40625, "epoch": 1.004154764996105, "grad_norm": 0.06808894872665405, "kl": 0.4143032245337963, "learning_rate": 5.008206873675433e-05, "loss": 0.0002, "reward": 0.9343750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.20000000298023224, "step": 484 }, { "completion_length": 739.546875, "epoch": 1.0062321474941573, "grad_norm": 0.04413120448589325, "kl": 0.4007079564034939, "learning_rate": 4.991793126324568e-05, "loss": 0.0002, "reward": 0.9656250476837158, "reward_std": 0.11048543266952038, "rewards/argmax_reward_func": 0.765625, "rewards/format_reward_func": 0.20000000298023224, "step": 485 }, { "completion_length": 764.390625, "epoch": 1.0083095299922098, "grad_norm": 0.05544662848114967, "kl": 0.40518468618392944, "learning_rate": 4.9753794674146206e-05, "loss": 0.0002, "reward": 1.0125000476837158, "reward_std": 0.17677669040858746, "rewards/argmax_reward_func": 0.8125, "rewards/format_reward_func": 0.20000000298023224, "step": 486 }, { "completion_length": 770.71875, "epoch": 1.0103869124902622, "grad_norm": 0.0641309842467308, "kl": 0.41656066104769707, "learning_rate": 4.9589660738264614e-05, "loss": 0.0002, "reward": 0.8718750476837158, "reward_std": 0.19887377880513668, "rewards/argmax_reward_func": 0.671875, "rewards/format_reward_func": 0.20000000298023224, "step": 487 }, { "completion_length": 774.984375, "epoch": 1.0124642949883147, "grad_norm": 0.06294507533311844, "kl": 0.41369784995913506, "learning_rate": 4.9425531224381163e-05, "loss": 0.0002, "reward": 0.7625000402331352, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 488 }, { "completion_length": 725.171875, "epoch": 1.0145416774863671, "grad_norm": 0.058232299983501434, "kl": 0.4683380052447319, "learning_rate": 4.926140790122835e-05, "loss": 0.0002, "reward": 0.8406250476837158, "reward_std": 0.19887377694249153, "rewards/argmax_reward_func": 0.640625, "rewards/format_reward_func": 0.20000000298023224, "step": 489 }, { "completion_length": 782.046875, "epoch": 1.0166190599844196, "grad_norm": 0.053580548614263535, "kl": 0.42908982560038567, "learning_rate": 4.909729253747197e-05, "loss": 0.0002, "reward": 0.8093750476837158, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 490 }, { "completion_length": 812.921875, "epoch": 1.018696442482472, "grad_norm": 0.07418368011713028, "kl": 0.4282660707831383, "learning_rate": 4.893318690169211e-05, "loss": 0.0002, "reward": 0.7625000476837158, "reward_std": 0.30935920774936676, "rewards/argmax_reward_func": 0.5625, "rewards/format_reward_func": 0.20000000298023224, "step": 491 }, { "completion_length": 772.4375, "epoch": 1.0207738249805245, "grad_norm": 0.053191013634204865, "kl": 0.42502470314502716, "learning_rate": 4.876909276236395e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 492 }, { "completion_length": 752.390625, "epoch": 1.022851207478577, "grad_norm": 0.05855753272771835, "kl": 0.42473678290843964, "learning_rate": 4.8605011887838797e-05, "loss": 0.0002, "reward": 0.8554687947034836, "reward_std": 0.22207572497427464, "rewards/argmax_reward_func": 0.65625, "rewards/format_reward_func": 0.19921875186264515, "step": 493 }, { "completion_length": 757.84375, "epoch": 1.0249285899766294, "grad_norm": 0.06828629225492477, "kl": 0.409926887601614, "learning_rate": 4.844094604632502e-05, "loss": 0.0002, "reward": 0.9968750476837158, "reward_std": 0.24306795001029968, "rewards/argmax_reward_func": 0.796875, "rewards/format_reward_func": 0.20000000298023224, "step": 494 }, { "completion_length": 781.3125, "epoch": 1.0270059724746818, "grad_norm": 0.07438631355762482, "kl": 0.41192958503961563, "learning_rate": 4.827689700586902e-05, "loss": 0.0002, "reward": 0.8093750439584255, "reward_std": 0.331456296145916, "rewards/argmax_reward_func": 0.609375, "rewards/format_reward_func": 0.20000000298023224, "step": 495 }, { "completion_length": 793.234375, "epoch": 1.0290833549727343, "grad_norm": 0.05079561844468117, "kl": 0.4043182320892811, "learning_rate": 4.811286653433609e-05, "loss": 0.0002, "reward": 0.7468750476837158, "reward_std": 0.15467960387468338, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 496 }, { "completion_length": 781.796875, "epoch": 1.0311607374707867, "grad_norm": 0.05202874913811684, "kl": 0.413474939763546, "learning_rate": 4.794885639939142e-05, "loss": 0.0002, "reward": 0.7468750439584255, "reward_std": 0.15467960573732853, "rewards/argmax_reward_func": 0.546875, "rewards/format_reward_func": 0.20000000298023224, "step": 497 }, { "completion_length": 810.25, "epoch": 1.0332381199688392, "grad_norm": 0.05652881786227226, "kl": 0.40745414793491364, "learning_rate": 4.7784868368481067e-05, "loss": 0.0002, "reward": 0.9335937947034836, "reward_std": 0.1999786328524351, "rewards/argmax_reward_func": 0.734375, "rewards/format_reward_func": 0.1992187537252903, "step": 498 }, { "completion_length": 799.765625, "epoch": 1.0353155024668916, "grad_norm": 0.0548785924911499, "kl": 0.39410270750522614, "learning_rate": 4.762090420881289e-05, "loss": 0.0002, "reward": 0.9953125491738319, "reward_std": 0.1568893175572157, "rewards/argmax_reward_func": 0.796875, "rewards/format_reward_func": 0.1984375026077032, "step": 499 }, { "completion_length": 777.75, "epoch": 1.037392884964944, "grad_norm": 0.06177806481719017, "kl": 0.6858577094972134, "learning_rate": 4.745696568733748e-05, "loss": 0.0003, "reward": 0.8250000439584255, "reward_std": 0.22097086533904076, "rewards/argmax_reward_func": 0.625, "rewards/format_reward_func": 0.20000000298023224, "step": 500 } ], "logging_steps": 1, "max_steps": 962, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }