|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.24, |
|
"eval_steps": 500, |
|
"global_step": 450, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 505.9479351043701, |
|
"epoch": 0.0010666666666666667, |
|
"grad_norm": 0.1341146091763646, |
|
"kl": 0.0, |
|
"learning_rate": 7.142857142857142e-08, |
|
"loss": -0.0, |
|
"reward": 0.3046875079162419, |
|
"reward_std": 0.4547263579443097, |
|
"rewards/equation_reward_func": 0.03645833441987634, |
|
"rewards/format_reward_func": 0.26822917466051877, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 511.1562671661377, |
|
"epoch": 0.0021333333333333334, |
|
"grad_norm": 0.11909890519385766, |
|
"kl": 0.0004194974899291992, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.3098958386108279, |
|
"reward_std": 0.4707766156643629, |
|
"rewards/equation_reward_func": 0.0494791679084301, |
|
"rewards/format_reward_func": 0.26041667349636555, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 484.7057456970215, |
|
"epoch": 0.0032, |
|
"grad_norm": 0.10838779091076944, |
|
"kl": 0.0003948211669921875, |
|
"learning_rate": 2.1428571428571426e-07, |
|
"loss": 0.0, |
|
"reward": 0.25520834187045693, |
|
"reward_std": 0.4062541304156184, |
|
"rewards/equation_reward_func": 0.04427083418704569, |
|
"rewards/format_reward_func": 0.2109375053551048, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 502.9635524749756, |
|
"epoch": 0.004266666666666667, |
|
"grad_norm": 0.11359857035268194, |
|
"kl": 0.00040793418884277344, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.3229166753590107, |
|
"reward_std": 0.4702935107052326, |
|
"rewards/equation_reward_func": 0.07291666860692203, |
|
"rewards/format_reward_func": 0.250000006519258, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 472.85418128967285, |
|
"epoch": 0.005333333333333333, |
|
"grad_norm": 0.13319802291837166, |
|
"kl": 0.00041985511779785156, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 0.0, |
|
"reward": 0.3359375074505806, |
|
"reward_std": 0.47765984758734703, |
|
"rewards/equation_reward_func": 0.052083334885537624, |
|
"rewards/format_reward_func": 0.28385417396202683, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 474.9609489440918, |
|
"epoch": 0.0064, |
|
"grad_norm": 0.1283866658242959, |
|
"kl": 0.0004889965057373047, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.40625000512227416, |
|
"reward_std": 0.5288777491077781, |
|
"rewards/equation_reward_func": 0.0703125016298145, |
|
"rewards/format_reward_func": 0.33593750884756446, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 487.7526226043701, |
|
"epoch": 0.007466666666666667, |
|
"grad_norm": 0.11491878824082066, |
|
"kl": 0.0008172988891601562, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 0.3984375139698386, |
|
"reward_std": 0.4919305704534054, |
|
"rewards/equation_reward_func": 0.05208333395421505, |
|
"rewards/format_reward_func": 0.3463541781529784, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 468.70313835144043, |
|
"epoch": 0.008533333333333334, |
|
"grad_norm": 0.12246266971775394, |
|
"kl": 0.0011203289031982422, |
|
"learning_rate": 4.999740409224932e-07, |
|
"loss": 0.0, |
|
"reward": 0.5494791846722364, |
|
"reward_std": 0.5318632125854492, |
|
"rewards/equation_reward_func": 0.08854167023673654, |
|
"rewards/format_reward_func": 0.46093751303851604, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 454.82292556762695, |
|
"epoch": 0.0096, |
|
"grad_norm": 0.10480668657811888, |
|
"kl": 0.00298309326171875, |
|
"learning_rate": 4.998961690809627e-07, |
|
"loss": 0.0, |
|
"reward": 0.6796875204890966, |
|
"reward_std": 0.5534657873213291, |
|
"rewards/equation_reward_func": 0.06770833465270698, |
|
"rewards/format_reward_func": 0.611979192122817, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 453.3411560058594, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 0.11208435254258003, |
|
"kl": 0.005069732666015625, |
|
"learning_rate": 4.997664006472578e-07, |
|
"loss": 0.0, |
|
"reward": 0.7500000186264515, |
|
"reward_std": 0.5607063695788383, |
|
"rewards/equation_reward_func": 0.0859375016298145, |
|
"rewards/format_reward_func": 0.6640625167638063, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 450.89063262939453, |
|
"epoch": 0.011733333333333333, |
|
"grad_norm": 0.10552116383248636, |
|
"kl": 0.005932807922363281, |
|
"learning_rate": 4.995847625707292e-07, |
|
"loss": 0.0, |
|
"reward": 0.8593750149011612, |
|
"reward_std": 0.5123661290854216, |
|
"rewards/equation_reward_func": 0.09895833511836827, |
|
"rewards/format_reward_func": 0.7604166939854622, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 449.1198043823242, |
|
"epoch": 0.0128, |
|
"grad_norm": 0.10482422281110657, |
|
"kl": 0.007808685302734375, |
|
"learning_rate": 4.993512925726318e-07, |
|
"loss": 0.0, |
|
"reward": 0.8958333544433117, |
|
"reward_std": 0.44584160670638084, |
|
"rewards/equation_reward_func": 0.07812500302679837, |
|
"rewards/format_reward_func": 0.8177083507180214, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 437.75782012939453, |
|
"epoch": 0.013866666666666666, |
|
"grad_norm": 0.08078578907154227, |
|
"kl": 0.0073490142822265625, |
|
"learning_rate": 4.990660391382923e-07, |
|
"loss": 0.0, |
|
"reward": 0.9505208656191826, |
|
"reward_std": 0.40651129884645343, |
|
"rewards/equation_reward_func": 0.09895833604969084, |
|
"rewards/format_reward_func": 0.8515625186264515, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 441.77345275878906, |
|
"epoch": 0.014933333333333333, |
|
"grad_norm": 0.08279347003242855, |
|
"kl": 0.00899505615234375, |
|
"learning_rate": 4.987290615070384e-07, |
|
"loss": 0.0, |
|
"reward": 0.9583333544433117, |
|
"reward_std": 0.31725937221199274, |
|
"rewards/equation_reward_func": 0.06510416814126074, |
|
"rewards/format_reward_func": 0.893229179084301, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 438.5364685058594, |
|
"epoch": 0.016, |
|
"grad_norm": 0.07534793657846317, |
|
"kl": 0.01221466064453125, |
|
"learning_rate": 4.983404296598978e-07, |
|
"loss": 0.0, |
|
"reward": 1.0104166828095913, |
|
"reward_std": 0.286367348395288, |
|
"rewards/equation_reward_func": 0.08072916860692203, |
|
"rewards/format_reward_func": 0.9296875186264515, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 446.8125114440918, |
|
"epoch": 0.017066666666666667, |
|
"grad_norm": 0.07968622664869553, |
|
"kl": 0.011959075927734375, |
|
"learning_rate": 4.979002243050646e-07, |
|
"loss": 0.0, |
|
"reward": 1.0026041977107525, |
|
"reward_std": 0.344503759406507, |
|
"rewards/equation_reward_func": 0.09635416860692203, |
|
"rewards/format_reward_func": 0.9062500223517418, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 427.65886306762695, |
|
"epoch": 0.018133333333333335, |
|
"grad_norm": 0.08471832672268678, |
|
"kl": 0.027118682861328125, |
|
"learning_rate": 4.974085368611381e-07, |
|
"loss": 0.0, |
|
"reward": 1.0442708693444729, |
|
"reward_std": 0.2840048740617931, |
|
"rewards/equation_reward_func": 0.1015625037252903, |
|
"rewards/format_reward_func": 0.9427083544433117, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 444.25261878967285, |
|
"epoch": 0.0192, |
|
"grad_norm": 0.06852883069586094, |
|
"kl": 0.01403045654296875, |
|
"learning_rate": 4.968654694381379e-07, |
|
"loss": 0.0, |
|
"reward": 0.9921875335276127, |
|
"reward_std": 0.21781930467113853, |
|
"rewards/equation_reward_func": 0.05208333511836827, |
|
"rewards/format_reward_func": 0.9401041865348816, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 383.59896659851074, |
|
"epoch": 0.020266666666666665, |
|
"grad_norm": 0.08057979196934888, |
|
"kl": 0.0169677734375, |
|
"learning_rate": 4.962711348162987e-07, |
|
"loss": 0.0, |
|
"reward": 1.1223958656191826, |
|
"reward_std": 0.2702699927613139, |
|
"rewards/equation_reward_func": 0.14062500395812094, |
|
"rewards/format_reward_func": 0.9817708507180214, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 411.5677185058594, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 0.07850698291787955, |
|
"kl": 0.017246246337890625, |
|
"learning_rate": 4.956256564226487e-07, |
|
"loss": 0.0, |
|
"reward": 1.0989583656191826, |
|
"reward_std": 0.2887088777497411, |
|
"rewards/equation_reward_func": 0.13281250279396772, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 396.64584159851074, |
|
"epoch": 0.0224, |
|
"grad_norm": 0.07505103817031399, |
|
"kl": 0.017597198486328125, |
|
"learning_rate": 4.949291683053768e-07, |
|
"loss": 0.0, |
|
"reward": 1.0807291977107525, |
|
"reward_std": 0.2670950279571116, |
|
"rewards/equation_reward_func": 0.11458333698101342, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 387.0468864440918, |
|
"epoch": 0.023466666666666667, |
|
"grad_norm": 0.09175240895759779, |
|
"kl": 0.017871856689453125, |
|
"learning_rate": 4.941818151059955e-07, |
|
"loss": 0.0, |
|
"reward": 1.1015625335276127, |
|
"reward_std": 0.2870901683345437, |
|
"rewards/equation_reward_func": 0.1354166711680591, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 400.3645935058594, |
|
"epoch": 0.024533333333333334, |
|
"grad_norm": 0.09491357639118295, |
|
"kl": 0.019161224365234375, |
|
"learning_rate": 4.933837520293017e-07, |
|
"loss": 0.0, |
|
"reward": 1.070312537252903, |
|
"reward_std": 0.2785795754753053, |
|
"rewards/equation_reward_func": 0.10937500442378223, |
|
"rewards/format_reward_func": 0.9609375260770321, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 403.8671989440918, |
|
"epoch": 0.0256, |
|
"grad_norm": 0.08449768835766272, |
|
"kl": 0.01947021484375, |
|
"learning_rate": 4.925351448111454e-07, |
|
"loss": 0.0, |
|
"reward": 1.0598958767950535, |
|
"reward_std": 0.1955897193402052, |
|
"rewards/equation_reward_func": 0.08333333604969084, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 384.1770896911621, |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.09879170444522951, |
|
"kl": 0.02040863037109375, |
|
"learning_rate": 4.91636169684011e-07, |
|
"loss": 0.0, |
|
"reward": 1.1223958730697632, |
|
"reward_std": 0.31093722581863403, |
|
"rewards/equation_reward_func": 0.1406250037252903, |
|
"rewards/format_reward_func": 0.9817708432674408, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 391.669282913208, |
|
"epoch": 0.027733333333333332, |
|
"grad_norm": 0.10757568231914379, |
|
"kl": 0.0244903564453125, |
|
"learning_rate": 4.906870133404186e-07, |
|
"loss": 0.0, |
|
"reward": 1.1197916977107525, |
|
"reward_std": 0.3494974756613374, |
|
"rewards/equation_reward_func": 0.15885417140088975, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 387.16407012939453, |
|
"epoch": 0.0288, |
|
"grad_norm": 0.0916962283697697, |
|
"kl": 0.02394866943359375, |
|
"learning_rate": 4.896878728941531e-07, |
|
"loss": 0.0, |
|
"reward": 1.1067708656191826, |
|
"reward_std": 0.25607615802437067, |
|
"rewards/equation_reward_func": 0.1328125020954758, |
|
"rewards/format_reward_func": 0.9739583544433117, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 346.4114646911621, |
|
"epoch": 0.029866666666666666, |
|
"grad_norm": 0.09993350369732659, |
|
"kl": 0.0276031494140625, |
|
"learning_rate": 4.886389558393284e-07, |
|
"loss": 0.0, |
|
"reward": 1.1510416939854622, |
|
"reward_std": 0.2859157114289701, |
|
"rewards/equation_reward_func": 0.16145833767950535, |
|
"rewards/format_reward_func": 0.9895833432674408, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 361.99219512939453, |
|
"epoch": 0.030933333333333334, |
|
"grad_norm": 0.11653485215024455, |
|
"kl": 0.02984619140625, |
|
"learning_rate": 4.875404800072976e-07, |
|
"loss": 0.0, |
|
"reward": 1.1640625447034836, |
|
"reward_std": 0.3471745736896992, |
|
"rewards/equation_reward_func": 0.18750000558793545, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 367.1015739440918, |
|
"epoch": 0.032, |
|
"grad_norm": 0.07180913754511904, |
|
"kl": 0.03044891357421875, |
|
"learning_rate": 4.86392673521415e-07, |
|
"loss": 0.0, |
|
"reward": 1.0911458805203438, |
|
"reward_std": 0.1999878236092627, |
|
"rewards/equation_reward_func": 0.10416666930541396, |
|
"rewards/format_reward_func": 0.9869791753590107, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 366.5208435058594, |
|
"epoch": 0.03306666666666667, |
|
"grad_norm": 0.08088172620555445, |
|
"kl": 0.0330810546875, |
|
"learning_rate": 4.851957747496606e-07, |
|
"loss": 0.0, |
|
"reward": 1.1510416939854622, |
|
"reward_std": 0.28296295227482915, |
|
"rewards/equation_reward_func": 0.16927083488553762, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 357.73178482055664, |
|
"epoch": 0.034133333333333335, |
|
"grad_norm": 0.0844167380266008, |
|
"kl": 0.03631591796875, |
|
"learning_rate": 4.839500322551386e-07, |
|
"loss": 0.0, |
|
"reward": 1.1197916939854622, |
|
"reward_std": 0.2452517431229353, |
|
"rewards/equation_reward_func": 0.14843750186264515, |
|
"rewards/format_reward_func": 0.9713541753590107, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 353.9739685058594, |
|
"epoch": 0.0352, |
|
"grad_norm": 0.0778527671209511, |
|
"kl": 0.041229248046875, |
|
"learning_rate": 4.826557047444563e-07, |
|
"loss": 0.0, |
|
"reward": 1.1796875298023224, |
|
"reward_std": 0.30663188826292753, |
|
"rewards/equation_reward_func": 0.19791667279787362, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 348.2239685058594, |
|
"epoch": 0.03626666666666667, |
|
"grad_norm": 0.07408528500512421, |
|
"kl": 0.044708251953125, |
|
"learning_rate": 4.813130610139993e-07, |
|
"loss": 0.0, |
|
"reward": 1.0729167014360428, |
|
"reward_std": 0.17930190591141582, |
|
"rewards/equation_reward_func": 0.0885416695382446, |
|
"rewards/format_reward_func": 0.9843750149011612, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 318.35938835144043, |
|
"epoch": 0.037333333333333336, |
|
"grad_norm": 0.10471668022395769, |
|
"kl": 0.0505828857421875, |
|
"learning_rate": 4.799223798941089e-07, |
|
"loss": 0.0001, |
|
"reward": 1.187500037252903, |
|
"reward_std": 0.2974981819279492, |
|
"rewards/equation_reward_func": 0.2031250053551048, |
|
"rewards/format_reward_func": 0.9843750111758709, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 312.2213659286499, |
|
"epoch": 0.0384, |
|
"grad_norm": 0.08445574387607607, |
|
"kl": 0.058990478515625, |
|
"learning_rate": 4.78483950191177e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1562500298023224, |
|
"reward_std": 0.23554043704643846, |
|
"rewards/equation_reward_func": 0.17187500651925802, |
|
"rewards/format_reward_func": 0.9843750111758709, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 320.13542556762695, |
|
"epoch": 0.039466666666666664, |
|
"grad_norm": 0.10154941280104149, |
|
"kl": 0.0615997314453125, |
|
"learning_rate": 4.769980706276687e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1770833730697632, |
|
"reward_std": 0.26962050748988986, |
|
"rewards/equation_reward_func": 0.19270834000781178, |
|
"rewards/format_reward_func": 0.9843750111758709, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 334.70052909851074, |
|
"epoch": 0.04053333333333333, |
|
"grad_norm": 0.08509345877302323, |
|
"kl": 0.061676025390625, |
|
"learning_rate": 4.7546504978008595e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.20033816620707512, |
|
"rewards/equation_reward_func": 0.15885417233221233, |
|
"rewards/format_reward_func": 0.986979179084301, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 333.23438453674316, |
|
"epoch": 0.0416, |
|
"grad_norm": 0.10027144175078107, |
|
"kl": 0.065399169921875, |
|
"learning_rate": 4.738852060148848e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1171875447034836, |
|
"reward_std": 0.23261011950671673, |
|
"rewards/equation_reward_func": 0.13541667070239782, |
|
"rewards/format_reward_func": 0.9817708507180214, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 331.69011878967285, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 0.07507534432076213, |
|
"kl": 0.071014404296875, |
|
"learning_rate": 4.722588674223593e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1276042014360428, |
|
"reward_std": 0.2506814347580075, |
|
"rewards/equation_reward_func": 0.14843750442378223, |
|
"rewards/format_reward_func": 0.9791666828095913, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 344.0781364440918, |
|
"epoch": 0.04373333333333333, |
|
"grad_norm": 0.09863254302808237, |
|
"kl": 0.070526123046875, |
|
"learning_rate": 4.70586371748506e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2031250447034836, |
|
"reward_std": 0.2764001186005771, |
|
"rewards/equation_reward_func": 0.2109375074505806, |
|
"rewards/format_reward_func": 0.9921875074505806, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 330.4479274749756, |
|
"epoch": 0.0448, |
|
"grad_norm": 0.10155910053999813, |
|
"kl": 0.07550048828125, |
|
"learning_rate": 4.6886806632488363e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2708333730697632, |
|
"reward_std": 0.3232872476801276, |
|
"rewards/equation_reward_func": 0.28906251140870154, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 341.1197986602783, |
|
"epoch": 0.04586666666666667, |
|
"grad_norm": 0.09455703883061281, |
|
"kl": 0.07513427734375, |
|
"learning_rate": 4.6710430799648143e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1953125298023224, |
|
"reward_std": 0.3194303079508245, |
|
"rewards/equation_reward_func": 0.2291666753590107, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 341.54427909851074, |
|
"epoch": 0.046933333333333334, |
|
"grad_norm": 0.08290471243926564, |
|
"kl": 0.077423095703125, |
|
"learning_rate": 4.652954630476127e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1979167014360428, |
|
"reward_std": 0.2291324818506837, |
|
"rewards/equation_reward_func": 0.2239583395421505, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 335.0989627838135, |
|
"epoch": 0.048, |
|
"grad_norm": 0.10748566516697469, |
|
"kl": 0.088287353515625, |
|
"learning_rate": 4.6344190712584713e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1692708730697632, |
|
"reward_std": 0.3015799345448613, |
|
"rewards/equation_reward_func": 0.1979166700039059, |
|
"rewards/format_reward_func": 0.9713541902601719, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 314.3177185058594, |
|
"epoch": 0.04906666666666667, |
|
"grad_norm": 0.10758855837243832, |
|
"kl": 0.08538818359375, |
|
"learning_rate": 4.615440251639995e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3151041977107525, |
|
"reward_std": 0.3682410903275013, |
|
"rewards/equation_reward_func": 0.33593750838190317, |
|
"rewards/format_reward_func": 0.9791666753590107, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 313.01302909851074, |
|
"epoch": 0.050133333333333335, |
|
"grad_norm": 0.124066638172858, |
|
"kl": 0.0859375, |
|
"learning_rate": 4.596022113001894e-07, |
|
"loss": 0.0001, |
|
"reward": 1.276041705161333, |
|
"reward_std": 0.30914933141320944, |
|
"rewards/equation_reward_func": 0.29427084303461015, |
|
"rewards/format_reward_func": 0.9817708432674408, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 334.85417556762695, |
|
"epoch": 0.0512, |
|
"grad_norm": 0.1018803932324317, |
|
"kl": 0.088104248046875, |
|
"learning_rate": 4.576168687959895e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2135417014360428, |
|
"reward_std": 0.2573512555100024, |
|
"rewards/equation_reward_func": 0.2395833416376263, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 351.9817810058594, |
|
"epoch": 0.05226666666666667, |
|
"grad_norm": 0.10509374857128695, |
|
"kl": 0.098785400390625, |
|
"learning_rate": 4.555884099526793e-07, |
|
"loss": 0.0001, |
|
"reward": 1.250000037252903, |
|
"reward_std": 0.29483586829155684, |
|
"rewards/equation_reward_func": 0.27083334093913436, |
|
"rewards/format_reward_func": 0.9791666753590107, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 358.38021755218506, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.0978516383302316, |
|
"kl": 0.08575439453125, |
|
"learning_rate": 4.5351725602562174e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2942708656191826, |
|
"reward_std": 0.32903878297656775, |
|
"rewards/equation_reward_func": 0.3203125046566129, |
|
"rewards/format_reward_func": 0.9739583432674408, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 361.90365409851074, |
|
"epoch": 0.0544, |
|
"grad_norm": 0.07892841773395727, |
|
"kl": 0.092681884765625, |
|
"learning_rate": 4.514038371367791e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2838542014360428, |
|
"reward_std": 0.23603887297213078, |
|
"rewards/equation_reward_func": 0.2942708432674408, |
|
"rewards/format_reward_func": 0.9895833395421505, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 370.2447986602783, |
|
"epoch": 0.055466666666666664, |
|
"grad_norm": 0.07956969957231312, |
|
"kl": 0.088226318359375, |
|
"learning_rate": 4.4924859218538936e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2682292088866234, |
|
"reward_std": 0.2611841419711709, |
|
"rewards/equation_reward_func": 0.2838541760575026, |
|
"rewards/format_reward_func": 0.9843750074505806, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 405.403657913208, |
|
"epoch": 0.05653333333333333, |
|
"grad_norm": 0.10207984517578009, |
|
"kl": 0.0877227783203125, |
|
"learning_rate": 4.470519687568185e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2786458618938923, |
|
"reward_std": 0.27022232208400965, |
|
"rewards/equation_reward_func": 0.31250000838190317, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 392.93490982055664, |
|
"epoch": 0.0576, |
|
"grad_norm": 0.08438917528245744, |
|
"kl": 0.0877685546875, |
|
"learning_rate": 4.4481442302960923e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3072917014360428, |
|
"reward_std": 0.31525306357070804, |
|
"rewards/equation_reward_func": 0.34375000838190317, |
|
"rewards/format_reward_func": 0.963541679084301, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 399.8698024749756, |
|
"epoch": 0.058666666666666666, |
|
"grad_norm": 0.08270590545214734, |
|
"kl": 0.09637451171875, |
|
"learning_rate": 4.4253641968074505e-07, |
|
"loss": 0.0001, |
|
"reward": 1.268229190260172, |
|
"reward_std": 0.24568770825862885, |
|
"rewards/equation_reward_func": 0.3046875062864274, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 409.60417556762695, |
|
"epoch": 0.05973333333333333, |
|
"grad_norm": 0.10271913225077348, |
|
"kl": 0.0924072265625, |
|
"learning_rate": 4.402184317891501e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2812500335276127, |
|
"reward_std": 0.33530174382030964, |
|
"rewards/equation_reward_func": 0.3385416748933494, |
|
"rewards/format_reward_func": 0.9427083544433117, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 416.4088649749756, |
|
"epoch": 0.0608, |
|
"grad_norm": 0.08166810576477633, |
|
"kl": 0.095794677734375, |
|
"learning_rate": 4.37860940737443e-07, |
|
"loss": 0.0001, |
|
"reward": 1.1770833805203438, |
|
"reward_std": 0.26351519441232085, |
|
"rewards/equation_reward_func": 0.22395834187045693, |
|
"rewards/format_reward_func": 0.9531250149011612, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 390.3463611602783, |
|
"epoch": 0.06186666666666667, |
|
"grad_norm": 0.09414353563065953, |
|
"kl": 0.11090087890625, |
|
"learning_rate": 4.354644361119671e-07, |
|
"loss": 0.0001, |
|
"reward": 1.398437537252903, |
|
"reward_std": 0.30470984475687146, |
|
"rewards/equation_reward_func": 0.42187501583248377, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 378.01563358306885, |
|
"epoch": 0.06293333333333333, |
|
"grad_norm": 0.07635029320541607, |
|
"kl": 0.124725341796875, |
|
"learning_rate": 4.3302941560111716e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.36394598754122853, |
|
"rewards/equation_reward_func": 0.4166666765231639, |
|
"rewards/format_reward_func": 0.9791666865348816, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 392.65625953674316, |
|
"epoch": 0.064, |
|
"grad_norm": 0.0833024147650861, |
|
"kl": 0.1026611328125, |
|
"learning_rate": 4.3055638489198236e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3359375298023224, |
|
"reward_std": 0.37286510691046715, |
|
"rewards/equation_reward_func": 0.3906250102445483, |
|
"rewards/format_reward_func": 0.9453125186264515, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 399.5078182220459, |
|
"epoch": 0.06506666666666666, |
|
"grad_norm": 0.0892199212165042, |
|
"kl": 0.1014404296875, |
|
"learning_rate": 4.280458575653296e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3307292088866234, |
|
"reward_std": 0.3504871279001236, |
|
"rewards/equation_reward_func": 0.38802084513008595, |
|
"rewards/format_reward_func": 0.9427083507180214, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 450.1354331970215, |
|
"epoch": 0.06613333333333334, |
|
"grad_norm": 0.06581923430481687, |
|
"kl": 0.114990234375, |
|
"learning_rate": 4.2549835498894665e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2604166939854622, |
|
"reward_std": 0.3068140549585223, |
|
"rewards/equation_reward_func": 0.32552084559574723, |
|
"rewards/format_reward_func": 0.9348958544433117, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 390.036470413208, |
|
"epoch": 0.0672, |
|
"grad_norm": 0.07114986931726634, |
|
"kl": 0.10528564453125, |
|
"learning_rate": 4.229144062093679e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3723958730697632, |
|
"reward_std": 0.29870040342211723, |
|
"rewards/equation_reward_func": 0.39843751303851604, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 392.59115505218506, |
|
"epoch": 0.06826666666666667, |
|
"grad_norm": 0.0877107079994648, |
|
"kl": 0.109405517578125, |
|
"learning_rate": 4.2029454784200675e-07, |
|
"loss": 0.0001, |
|
"reward": 1.390625037252903, |
|
"reward_std": 0.280646042432636, |
|
"rewards/equation_reward_func": 0.42447917722165585, |
|
"rewards/format_reward_func": 0.9661458507180214, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 421.0078191757202, |
|
"epoch": 0.06933333333333333, |
|
"grad_norm": 0.09643905280459295, |
|
"kl": 0.10009765625, |
|
"learning_rate": 4.1763932395971433e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2942708693444729, |
|
"reward_std": 0.3986189612187445, |
|
"rewards/equation_reward_func": 0.3567708421032876, |
|
"rewards/format_reward_func": 0.9375000223517418, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 461.4791736602783, |
|
"epoch": 0.0704, |
|
"grad_norm": 0.06366382823979087, |
|
"kl": 0.101837158203125, |
|
"learning_rate": 4.1494928597979117e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2760417088866234, |
|
"reward_std": 0.27500381181016564, |
|
"rewards/equation_reward_func": 0.32291667629033327, |
|
"rewards/format_reward_func": 0.9531250186264515, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 394.3255319595337, |
|
"epoch": 0.07146666666666666, |
|
"grad_norm": 0.0853911421540347, |
|
"kl": 0.130126953125, |
|
"learning_rate": 4.122249925494726e-07, |
|
"loss": 0.0001, |
|
"reward": 1.403645858168602, |
|
"reward_std": 0.25308565702289343, |
|
"rewards/equation_reward_func": 0.43229168001562357, |
|
"rewards/format_reward_func": 0.9713541828095913, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 426.466157913208, |
|
"epoch": 0.07253333333333334, |
|
"grad_norm": 0.0692987274556644, |
|
"kl": 0.11669921875, |
|
"learning_rate": 4.094670094299131e-07, |
|
"loss": 0.0001, |
|
"reward": 1.281250037252903, |
|
"reward_std": 0.316250397823751, |
|
"rewards/equation_reward_func": 0.33072917559184134, |
|
"rewards/format_reward_func": 0.9505208469927311, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 439.9010543823242, |
|
"epoch": 0.0736, |
|
"grad_norm": 0.08170658774133101, |
|
"kl": 0.104949951171875, |
|
"learning_rate": 4.066759093786931e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2760416977107525, |
|
"reward_std": 0.35973797645419836, |
|
"rewards/equation_reward_func": 0.3411458428017795, |
|
"rewards/format_reward_func": 0.9348958618938923, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 416.6406354904175, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.06667199771271264, |
|
"kl": 0.12353515625, |
|
"learning_rate": 4.038522720308732e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3854167088866234, |
|
"reward_std": 0.21267010737210512, |
|
"rewards/equation_reward_func": 0.4088541741948575, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 440.536470413208, |
|
"epoch": 0.07573333333333333, |
|
"grad_norm": 0.0850091968151683, |
|
"kl": 0.11474609375, |
|
"learning_rate": 4.009966837786194e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3255208693444729, |
|
"reward_std": 0.30754279950633645, |
|
"rewards/equation_reward_func": 0.36458334629423916, |
|
"rewards/format_reward_func": 0.9609375074505806, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 401.09896659851074, |
|
"epoch": 0.0768, |
|
"grad_norm": 0.0768836508261685, |
|
"kl": 0.116485595703125, |
|
"learning_rate": 3.981097376494259e-07, |
|
"loss": 0.0001, |
|
"reward": 1.4557292237877846, |
|
"reward_std": 0.31219895882532, |
|
"rewards/equation_reward_func": 0.5026041828095913, |
|
"rewards/format_reward_func": 0.9531250186264515, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 461.122407913208, |
|
"epoch": 0.07786666666666667, |
|
"grad_norm": 0.10600525349484782, |
|
"kl": 0.114044189453125, |
|
"learning_rate": 3.951920331829592e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2890625335276127, |
|
"reward_std": 0.2976598385721445, |
|
"rewards/equation_reward_func": 0.3255208437331021, |
|
"rewards/format_reward_func": 0.963541679084301, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 413.9713649749756, |
|
"epoch": 0.07893333333333333, |
|
"grad_norm": 0.0723392326431143, |
|
"kl": 0.123321533203125, |
|
"learning_rate": 3.922441763065506e-07, |
|
"loss": 0.0001, |
|
"reward": 1.4088542014360428, |
|
"reward_std": 0.23494611913338304, |
|
"rewards/equation_reward_func": 0.43750000931322575, |
|
"rewards/format_reward_func": 0.9713541828095913, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 458.7604331970215, |
|
"epoch": 0.08, |
|
"grad_norm": 0.09048584328529992, |
|
"kl": 0.12384033203125, |
|
"learning_rate": 3.8926677920936093e-07, |
|
"loss": 0.0001, |
|
"reward": 1.2656250223517418, |
|
"reward_std": 0.3245450472459197, |
|
"rewards/equation_reward_func": 0.32031250838190317, |
|
"rewards/format_reward_func": 0.9453125111758709, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 378.33073902130127, |
|
"epoch": 0.08106666666666666, |
|
"grad_norm": 0.10455674533718096, |
|
"kl": 0.13720703125, |
|
"learning_rate": 3.862604602152464e-07, |
|
"loss": 0.0001, |
|
"reward": 1.4244792014360428, |
|
"reward_std": 0.26624298514798284, |
|
"rewards/equation_reward_func": 0.46093751210719347, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 421.9140796661377, |
|
"epoch": 0.08213333333333334, |
|
"grad_norm": 0.10103705731464013, |
|
"kl": 0.129638671875, |
|
"learning_rate": 3.8322584365434934e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3255208879709244, |
|
"reward_std": 0.24930242728441954, |
|
"rewards/equation_reward_func": 0.3723958465270698, |
|
"rewards/format_reward_func": 0.9531250186264515, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 463.58334159851074, |
|
"epoch": 0.0832, |
|
"grad_norm": 0.09221432956401719, |
|
"kl": 0.127166748046875, |
|
"learning_rate": 3.8016355973344173e-07, |
|
"loss": 0.0001, |
|
"reward": 1.234375037252903, |
|
"reward_std": 0.2910663695074618, |
|
"rewards/equation_reward_func": 0.28906250605359674, |
|
"rewards/format_reward_func": 0.9453125260770321, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 419.51563835144043, |
|
"epoch": 0.08426666666666667, |
|
"grad_norm": 0.08138226453807305, |
|
"kl": 0.1285400390625, |
|
"learning_rate": 3.7707424440504863e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3489583730697632, |
|
"reward_std": 0.23599386168643832, |
|
"rewards/equation_reward_func": 0.37500001094304025, |
|
"rewards/format_reward_func": 0.9739583544433117, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 361.1015729904175, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 0.13299459818559828, |
|
"kl": 0.15423583984375, |
|
"learning_rate": 3.739585392353787e-07, |
|
"loss": 0.0002, |
|
"reward": 1.434895858168602, |
|
"reward_std": 0.28986221412196755, |
|
"rewards/equation_reward_func": 0.458333341171965, |
|
"rewards/format_reward_func": 0.9765625186264515, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 391.0599117279053, |
|
"epoch": 0.0864, |
|
"grad_norm": 0.10062549742509476, |
|
"kl": 0.140045166015625, |
|
"learning_rate": 3.7081709127108767e-07, |
|
"loss": 0.0001, |
|
"reward": 1.4244791939854622, |
|
"reward_std": 0.2554763099178672, |
|
"rewards/equation_reward_func": 0.4427083439659327, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 346.71094512939453, |
|
"epoch": 0.08746666666666666, |
|
"grad_norm": 0.07557865430106443, |
|
"kl": 0.165771484375, |
|
"learning_rate": 3.6765055290490513e-07, |
|
"loss": 0.0002, |
|
"reward": 1.510416716337204, |
|
"reward_std": 0.23889524163678288, |
|
"rewards/equation_reward_func": 0.5390625223517418, |
|
"rewards/format_reward_func": 0.9713541828095913, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 374.559907913208, |
|
"epoch": 0.08853333333333334, |
|
"grad_norm": 0.09484161296330915, |
|
"kl": 0.145050048828125, |
|
"learning_rate": 3.644595817401501e-07, |
|
"loss": 0.0001, |
|
"reward": 1.4140625596046448, |
|
"reward_std": 0.2526052575558424, |
|
"rewards/equation_reward_func": 0.43229167931713164, |
|
"rewards/format_reward_func": 0.9817708507180214, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 389.614595413208, |
|
"epoch": 0.0896, |
|
"grad_norm": 0.10850466477020716, |
|
"kl": 0.140869140625, |
|
"learning_rate": 3.6124484045416483e-07, |
|
"loss": 0.0001, |
|
"reward": 1.3411458730697632, |
|
"reward_std": 0.20541261043399572, |
|
"rewards/equation_reward_func": 0.3515625118743628, |
|
"rewards/format_reward_func": 0.9895833432674408, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 314.33594512939453, |
|
"epoch": 0.09066666666666667, |
|
"grad_norm": 0.09160402552556286, |
|
"kl": 0.159759521484375, |
|
"learning_rate": 3.580069966606949e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4739583805203438, |
|
"reward_std": 0.2342346585355699, |
|
"rewards/equation_reward_func": 0.5000000114087015, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 359.8906354904175, |
|
"epoch": 0.09173333333333333, |
|
"grad_norm": 0.09610423165466968, |
|
"kl": 0.154388427734375, |
|
"learning_rate": 3.547467227712444e-07, |
|
"loss": 0.0002, |
|
"reward": 1.437500037252903, |
|
"reward_std": 0.1834291499108076, |
|
"rewards/equation_reward_func": 0.45572918467223644, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 365.7343854904175, |
|
"epoch": 0.0928, |
|
"grad_norm": 0.09889865100739882, |
|
"kl": 0.15478515625, |
|
"learning_rate": 3.5146469585543386e-07, |
|
"loss": 0.0002, |
|
"reward": 1.414062537252903, |
|
"reward_std": 0.19458062946796417, |
|
"rewards/equation_reward_func": 0.4322916797827929, |
|
"rewards/format_reward_func": 0.9817708432674408, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 348.29427909851074, |
|
"epoch": 0.09386666666666667, |
|
"grad_norm": 0.09367098793216834, |
|
"kl": 0.159515380859375, |
|
"learning_rate": 3.481615975003922e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4739583879709244, |
|
"reward_std": 0.15797653933987021, |
|
"rewards/equation_reward_func": 0.4921875149011612, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 357.6145963668823, |
|
"epoch": 0.09493333333333333, |
|
"grad_norm": 0.07894542256229298, |
|
"kl": 0.150299072265625, |
|
"learning_rate": 3.448381136692089e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4401042126119137, |
|
"reward_std": 0.2548735234886408, |
|
"rewards/equation_reward_func": 0.4765625074505806, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 353.8281354904175, |
|
"epoch": 0.096, |
|
"grad_norm": 0.10120368862706956, |
|
"kl": 0.1510009765625, |
|
"learning_rate": 3.4149493455847897e-07, |
|
"loss": 0.0002, |
|
"reward": 1.377604216337204, |
|
"reward_std": 0.17720257258042693, |
|
"rewards/equation_reward_func": 0.3932291795499623, |
|
"rewards/format_reward_func": 0.9843750074505806, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 337.3437547683716, |
|
"epoch": 0.09706666666666666, |
|
"grad_norm": 0.06857743257585827, |
|
"kl": 0.171661376953125, |
|
"learning_rate": 3.3813275445496766e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3958333879709244, |
|
"reward_std": 0.216899492777884, |
|
"rewards/equation_reward_func": 0.4140625144354999, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 373.3619899749756, |
|
"epoch": 0.09813333333333334, |
|
"grad_norm": 0.07039499292151902, |
|
"kl": 0.185546875, |
|
"learning_rate": 3.347522715914262e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2838542088866234, |
|
"reward_std": 0.14952099742367864, |
|
"rewards/equation_reward_func": 0.29947917186655104, |
|
"rewards/format_reward_func": 0.9843750037252903, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 343.94011783599854, |
|
"epoch": 0.0992, |
|
"grad_norm": 0.07557857715641425, |
|
"kl": 0.172119140625, |
|
"learning_rate": 3.313541880015877e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3671875521540642, |
|
"reward_std": 0.15858241729438305, |
|
"rewards/equation_reward_func": 0.380208347691223, |
|
"rewards/format_reward_func": 0.986979179084301, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 360.5390748977661, |
|
"epoch": 0.10026666666666667, |
|
"grad_norm": 0.11214755840839478, |
|
"kl": 0.213623046875, |
|
"learning_rate": 3.279392093743747e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3880208730697632, |
|
"reward_std": 0.19066602177917957, |
|
"rewards/equation_reward_func": 0.4010416786186397, |
|
"rewards/format_reward_func": 0.9869791753590107, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 322.72397232055664, |
|
"epoch": 0.10133333333333333, |
|
"grad_norm": 0.09281179127833081, |
|
"kl": 0.2625732421875, |
|
"learning_rate": 3.245080449073459e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4557292088866234, |
|
"reward_std": 0.21278624143451452, |
|
"rewards/equation_reward_func": 0.4765625102445483, |
|
"rewards/format_reward_func": 0.9791666753590107, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 337.25261306762695, |
|
"epoch": 0.1024, |
|
"grad_norm": 0.12372831662094742, |
|
"kl": 0.18206787109375, |
|
"learning_rate": 3.210614071594162e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4218750447034836, |
|
"reward_std": 0.21987988194450736, |
|
"rewards/equation_reward_func": 0.440104179084301, |
|
"rewards/format_reward_func": 0.9817708395421505, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 344.55729961395264, |
|
"epoch": 0.10346666666666667, |
|
"grad_norm": 0.06378441341807725, |
|
"kl": 0.167724609375, |
|
"learning_rate": 3.1760001190287695e-07, |
|
"loss": 0.0002, |
|
"reward": 1.351562537252903, |
|
"reward_std": 0.14025551918894053, |
|
"rewards/equation_reward_func": 0.36979167722165585, |
|
"rewards/format_reward_func": 0.9817708432674408, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 314.03646659851074, |
|
"epoch": 0.10453333333333334, |
|
"grad_norm": 0.09003427985578723, |
|
"kl": 0.17474365234375, |
|
"learning_rate": 3.141245779747502e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4479167237877846, |
|
"reward_std": 0.2472039177082479, |
|
"rewards/equation_reward_func": 0.46875001722946763, |
|
"rewards/format_reward_func": 0.979166679084301, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 282.7083406448364, |
|
"epoch": 0.1056, |
|
"grad_norm": 0.1290749910466798, |
|
"kl": 0.21856689453125, |
|
"learning_rate": 3.106358271275056e-07, |
|
"loss": 0.0002, |
|
"reward": 1.476562537252903, |
|
"reward_std": 0.1649267366155982, |
|
"rewards/equation_reward_func": 0.4843750186264515, |
|
"rewards/format_reward_func": 0.9921875074505806, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 276.01823806762695, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.08477695235189277, |
|
"kl": 0.1934814453125, |
|
"learning_rate": 3.0713448387917227e-07, |
|
"loss": 0.0002, |
|
"reward": 1.5468750298023224, |
|
"reward_std": 0.13912134431302547, |
|
"rewards/equation_reward_func": 0.552083348389715, |
|
"rewards/format_reward_func": 0.9947916716337204, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 317.55730152130127, |
|
"epoch": 0.10773333333333333, |
|
"grad_norm": 2.4369008488049477, |
|
"kl": 5.66986083984375, |
|
"learning_rate": 3.0362127536287636e-07, |
|
"loss": 0.0057, |
|
"reward": 1.421875037252903, |
|
"reward_std": 0.16129080019891262, |
|
"rewards/equation_reward_func": 0.4270833423361182, |
|
"rewards/format_reward_func": 0.9947916679084301, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 321.65625762939453, |
|
"epoch": 0.1088, |
|
"grad_norm": 0.09396431699981035, |
|
"kl": 0.18658447265625, |
|
"learning_rate": 3.0009693117583523e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4348958656191826, |
|
"reward_std": 0.19856942351907492, |
|
"rewards/equation_reward_func": 0.453125013737008, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 330.83855056762695, |
|
"epoch": 0.10986666666666667, |
|
"grad_norm": 0.07687252722110068, |
|
"kl": 0.1839599609375, |
|
"learning_rate": 2.965621832278401e-07, |
|
"loss": 0.0002, |
|
"reward": 1.377604216337204, |
|
"reward_std": 0.15589443547651172, |
|
"rewards/equation_reward_func": 0.39583334885537624, |
|
"rewards/format_reward_func": 0.9817708469927311, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 328.23438358306885, |
|
"epoch": 0.11093333333333333, |
|
"grad_norm": 0.0810844061250071, |
|
"kl": 0.1962890625, |
|
"learning_rate": 2.9301776558925875e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3697917014360428, |
|
"reward_std": 0.20208620419725776, |
|
"rewards/equation_reward_func": 0.4036458465270698, |
|
"rewards/format_reward_func": 0.9661458544433117, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 313.95313262939453, |
|
"epoch": 0.112, |
|
"grad_norm": 0.0851816615508796, |
|
"kl": 0.23468017578125, |
|
"learning_rate": 2.894644143385885e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3958333767950535, |
|
"reward_std": 0.18581857532262802, |
|
"rewards/equation_reward_func": 0.4244791748933494, |
|
"rewards/format_reward_func": 0.971354179084301, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 326.42448806762695, |
|
"epoch": 0.11306666666666666, |
|
"grad_norm": 0.0786620471083819, |
|
"kl": 0.19378662109375, |
|
"learning_rate": 2.859028674095937e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4010417014360428, |
|
"reward_std": 0.1981433075852692, |
|
"rewards/equation_reward_func": 0.42708334513008595, |
|
"rewards/format_reward_func": 0.9739583432674408, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 308.4192781448364, |
|
"epoch": 0.11413333333333334, |
|
"grad_norm": 0.08352066179848143, |
|
"kl": 0.189453125, |
|
"learning_rate": 2.823338644380566e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4401042126119137, |
|
"reward_std": 0.2307603359222412, |
|
"rewards/equation_reward_func": 0.47135418048128486, |
|
"rewards/format_reward_func": 0.9687500149011612, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 341.86198711395264, |
|
"epoch": 0.1152, |
|
"grad_norm": 0.10334760188864624, |
|
"kl": 0.22479248046875, |
|
"learning_rate": 2.7875814660817504e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3880208693444729, |
|
"reward_std": 0.2630339222960174, |
|
"rewards/equation_reward_func": 0.4244791779201478, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 390.3906364440918, |
|
"epoch": 0.11626666666666667, |
|
"grad_norm": 0.1090470945421399, |
|
"kl": 0.2252197265625, |
|
"learning_rate": 2.751764564986396e-07, |
|
"loss": 0.0002, |
|
"reward": 1.223958358168602, |
|
"reward_std": 0.23174711503088474, |
|
"rewards/equation_reward_func": 0.27864584047347307, |
|
"rewards/format_reward_func": 0.9453125223517418, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 329.63282585144043, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.10077949546695844, |
|
"kl": 0.2005615234375, |
|
"learning_rate": 2.715895379284194e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.26168868225067854, |
|
"rewards/equation_reward_func": 0.4427083469927311, |
|
"rewards/format_reward_func": 0.9531250223517418, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 358.4375104904175, |
|
"epoch": 0.1184, |
|
"grad_norm": 0.08964140632655672, |
|
"kl": 0.21527099609375, |
|
"learning_rate": 2.6799813580229174e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3593750447034836, |
|
"reward_std": 0.25906086526811123, |
|
"rewards/equation_reward_func": 0.4036458386108279, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 343.70834255218506, |
|
"epoch": 0.11946666666666667, |
|
"grad_norm": 0.07620045097589506, |
|
"kl": 0.19964599609375, |
|
"learning_rate": 2.6440299595614606e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3307291939854622, |
|
"reward_std": 0.2277261232957244, |
|
"rewards/equation_reward_func": 0.3697916753590107, |
|
"rewards/format_reward_func": 0.9609375223517418, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 343.32032108306885, |
|
"epoch": 0.12053333333333334, |
|
"grad_norm": 0.08700892029776192, |
|
"kl": 0.2109375, |
|
"learning_rate": 2.6080486500209347e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3541667014360428, |
|
"reward_std": 0.21279292972758412, |
|
"rewards/equation_reward_func": 0.40104167512618005, |
|
"rewards/format_reward_func": 0.9531250149011612, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 275.88542556762695, |
|
"epoch": 0.1216, |
|
"grad_norm": 0.11035562445594559, |
|
"kl": 0.202880859375, |
|
"learning_rate": 2.572044901734166e-07, |
|
"loss": 0.0002, |
|
"reward": 1.5833333730697632, |
|
"reward_std": 0.25712650874629617, |
|
"rewards/equation_reward_func": 0.6197916865348816, |
|
"rewards/format_reward_func": 0.9635416716337204, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 314.13542318344116, |
|
"epoch": 0.12266666666666666, |
|
"grad_norm": 0.10200349640738855, |
|
"kl": 0.21881103515625, |
|
"learning_rate": 2.536026191693893e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4505208693444729, |
|
"reward_std": 0.29838538402691483, |
|
"rewards/equation_reward_func": 0.5156250093132257, |
|
"rewards/format_reward_func": 0.9348958544433117, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 334.46615505218506, |
|
"epoch": 0.12373333333333333, |
|
"grad_norm": 0.104610809797409, |
|
"kl": 0.20111083984375, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4479166939854622, |
|
"reward_std": 0.32854113075882196, |
|
"rewards/equation_reward_func": 0.4921875128056854, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 348.3463592529297, |
|
"epoch": 0.1248, |
|
"grad_norm": 0.11970668433207705, |
|
"kl": 0.21832275390625, |
|
"learning_rate": 2.4639738083061073e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2968750409781933, |
|
"reward_std": 0.28170605981722474, |
|
"rewards/equation_reward_func": 0.35937500884756446, |
|
"rewards/format_reward_func": 0.9375000149011612, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 353.0625104904175, |
|
"epoch": 0.12586666666666665, |
|
"grad_norm": 0.11146940462264297, |
|
"kl": 0.2242431640625, |
|
"learning_rate": 2.4279550982658345e-07, |
|
"loss": 0.0002, |
|
"reward": 1.283854205161333, |
|
"reward_std": 0.2352255848236382, |
|
"rewards/equation_reward_func": 0.3359375107102096, |
|
"rewards/format_reward_func": 0.9479166828095913, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 328.27865982055664, |
|
"epoch": 0.12693333333333334, |
|
"grad_norm": 0.09019157224178884, |
|
"kl": 0.2286376953125, |
|
"learning_rate": 2.3919513499790646e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4036458767950535, |
|
"reward_std": 0.2419091323390603, |
|
"rewards/equation_reward_func": 0.4557291849050671, |
|
"rewards/format_reward_func": 0.947916679084301, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 292.41146755218506, |
|
"epoch": 0.128, |
|
"grad_norm": 0.11216015195235872, |
|
"kl": 0.20770263671875, |
|
"learning_rate": 2.3559700404385394e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4218750521540642, |
|
"reward_std": 0.214123603887856, |
|
"rewards/equation_reward_func": 0.45833334303461015, |
|
"rewards/format_reward_func": 0.963541679084301, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 289.44011306762695, |
|
"epoch": 0.12906666666666666, |
|
"grad_norm": 0.06748907528166415, |
|
"kl": 0.21905517578125, |
|
"learning_rate": 2.3200186419770823e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4973958730697632, |
|
"reward_std": 0.21762575302273035, |
|
"rewards/equation_reward_func": 0.5390625144354999, |
|
"rewards/format_reward_func": 0.9583333544433117, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 312.97396659851074, |
|
"epoch": 0.13013333333333332, |
|
"grad_norm": 0.08585761519803439, |
|
"kl": 0.22705078125, |
|
"learning_rate": 2.284104620715807e-07, |
|
"loss": 0.0002, |
|
"reward": 1.382812537252903, |
|
"reward_std": 0.2223974741064012, |
|
"rewards/equation_reward_func": 0.4218750149011612, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 281.0208377838135, |
|
"epoch": 0.1312, |
|
"grad_norm": 0.09243139356469632, |
|
"kl": 0.22540283203125, |
|
"learning_rate": 2.2482354350136043e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4947917088866234, |
|
"reward_std": 0.23069008206948638, |
|
"rewards/equation_reward_func": 0.5208333479240537, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 301.8541736602783, |
|
"epoch": 0.13226666666666667, |
|
"grad_norm": 0.09610646803963738, |
|
"kl": 0.22479248046875, |
|
"learning_rate": 2.2124185339182496e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3802083730697632, |
|
"reward_std": 0.1815217286348343, |
|
"rewards/equation_reward_func": 0.4270833432674408, |
|
"rewards/format_reward_func": 0.9531250186264515, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 347.1015729904175, |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.1746681094283612, |
|
"kl": 0.23883056640625, |
|
"learning_rate": 2.1766613556194344e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2213542088866234, |
|
"reward_std": 0.22283816616982222, |
|
"rewards/equation_reward_func": 0.26041667559184134, |
|
"rewards/format_reward_func": 0.9609375149011612, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 293.10677909851074, |
|
"epoch": 0.1344, |
|
"grad_norm": 0.07969455343084161, |
|
"kl": 0.305419921875, |
|
"learning_rate": 2.1409713259040628e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4114583730697632, |
|
"reward_std": 0.2006126595661044, |
|
"rewards/equation_reward_func": 0.432291679084301, |
|
"rewards/format_reward_func": 0.9791666753590107, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 295.825532913208, |
|
"epoch": 0.13546666666666668, |
|
"grad_norm": 0.11043203499359036, |
|
"kl": 0.215087890625, |
|
"learning_rate": 2.105355856614115e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4114583730697632, |
|
"reward_std": 0.3007270940579474, |
|
"rewards/equation_reward_func": 0.46354168374091387, |
|
"rewards/format_reward_func": 0.9479166828095913, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 295.0286531448364, |
|
"epoch": 0.13653333333333334, |
|
"grad_norm": 0.10604018583177363, |
|
"kl": 0.2293701171875, |
|
"learning_rate": 2.069822344107413e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4401042088866234, |
|
"reward_std": 0.16259960131719708, |
|
"rewards/equation_reward_func": 0.46875001303851604, |
|
"rewards/format_reward_func": 0.9713541828095913, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 312.09897232055664, |
|
"epoch": 0.1376, |
|
"grad_norm": 0.11581309250324548, |
|
"kl": 0.22454833984375, |
|
"learning_rate": 2.034378167721599e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3411458618938923, |
|
"reward_std": 0.31250663055107, |
|
"rewards/equation_reward_func": 0.39322918001562357, |
|
"rewards/format_reward_func": 0.9479166939854622, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 301.36980152130127, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 0.10375800085599599, |
|
"kl": 0.24493408203125, |
|
"learning_rate": 1.9990306882416485e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4635416977107525, |
|
"reward_std": 0.2693312247283757, |
|
"rewards/equation_reward_func": 0.5104166809469461, |
|
"rewards/format_reward_func": 0.9531250149011612, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 314.0885524749756, |
|
"epoch": 0.13973333333333332, |
|
"grad_norm": 0.1087966329523751, |
|
"kl": 0.222900390625, |
|
"learning_rate": 1.9637872463712362e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4062500409781933, |
|
"reward_std": 0.26262100599706173, |
|
"rewards/equation_reward_func": 0.44270834792405367, |
|
"rewards/format_reward_func": 0.963541679084301, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 281.7239646911621, |
|
"epoch": 0.1408, |
|
"grad_norm": 0.09695420136164315, |
|
"kl": 0.26971435546875, |
|
"learning_rate": 1.9286551612082773e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4479167088866234, |
|
"reward_std": 0.2460037199780345, |
|
"rewards/equation_reward_func": 0.48697918094694614, |
|
"rewards/format_reward_func": 0.9609375149011612, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 315.37240505218506, |
|
"epoch": 0.14186666666666667, |
|
"grad_norm": 0.10037156483806228, |
|
"kl": 0.24798583984375, |
|
"learning_rate": 1.8936417287249446e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3385417088866234, |
|
"reward_std": 0.2581388554535806, |
|
"rewards/equation_reward_func": 0.39583334466442466, |
|
"rewards/format_reward_func": 0.9427083469927311, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 334.924485206604, |
|
"epoch": 0.14293333333333333, |
|
"grad_norm": 0.15279355937220046, |
|
"kl": 0.26873779296875, |
|
"learning_rate": 1.8587542202524985e-07, |
|
"loss": 0.0003, |
|
"reward": 1.268229205161333, |
|
"reward_std": 0.28603212209418416, |
|
"rewards/equation_reward_func": 0.3177083428017795, |
|
"rewards/format_reward_func": 0.9505208544433117, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 292.8906297683716, |
|
"epoch": 0.144, |
|
"grad_norm": 0.09627939797808117, |
|
"kl": 0.25811767578125, |
|
"learning_rate": 1.82399988097123e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3828125335276127, |
|
"reward_std": 0.240143911447376, |
|
"rewards/equation_reward_func": 0.4479166786186397, |
|
"rewards/format_reward_func": 0.9348958544433117, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 303.7291774749756, |
|
"epoch": 0.14506666666666668, |
|
"grad_norm": 0.09414307623625273, |
|
"kl": 0.25408935546875, |
|
"learning_rate": 1.7893859284058378e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3671875298023224, |
|
"reward_std": 0.24746731435880065, |
|
"rewards/equation_reward_func": 0.4218750111758709, |
|
"rewards/format_reward_func": 0.9453125149011612, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 276.94792318344116, |
|
"epoch": 0.14613333333333334, |
|
"grad_norm": 0.1299701036522939, |
|
"kl": 0.57366943359375, |
|
"learning_rate": 1.7549195509265407e-07, |
|
"loss": 0.0006, |
|
"reward": 1.4348958730697632, |
|
"reward_std": 0.2572689475491643, |
|
"rewards/equation_reward_func": 0.4791666748933494, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 248.166672706604, |
|
"epoch": 0.1472, |
|
"grad_norm": 0.08206460484425186, |
|
"kl": 0.256103515625, |
|
"learning_rate": 1.7206079062562536e-07, |
|
"loss": 0.0003, |
|
"reward": 1.5833333656191826, |
|
"reward_std": 0.21109008882194757, |
|
"rewards/equation_reward_func": 0.6145833458285779, |
|
"rewards/format_reward_func": 0.9687500260770321, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 305.27865409851074, |
|
"epoch": 0.14826666666666666, |
|
"grad_norm": 0.10621644156716899, |
|
"kl": 0.2762451171875, |
|
"learning_rate": 1.6864581199841226e-07, |
|
"loss": 0.0003, |
|
"reward": 1.312500026077032, |
|
"reward_std": 0.24705103458836675, |
|
"rewards/equation_reward_func": 0.36718751094304025, |
|
"rewards/format_reward_func": 0.9453125186264515, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 286.6406297683716, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.10751127049009096, |
|
"kl": 0.26580810546875, |
|
"learning_rate": 1.6524772840857388e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3072916977107525, |
|
"reward_std": 0.2637113491073251, |
|
"rewards/equation_reward_func": 0.38281250768341124, |
|
"rewards/format_reward_func": 0.9244791865348816, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 275.32552909851074, |
|
"epoch": 0.1504, |
|
"grad_norm": 0.10203495847611208, |
|
"kl": 0.29620361328125, |
|
"learning_rate": 1.6186724554503237e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4687500409781933, |
|
"reward_std": 0.23805115604773164, |
|
"rewards/equation_reward_func": 0.5156250128056854, |
|
"rewards/format_reward_func": 0.9531250186264515, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 312.76823806762695, |
|
"epoch": 0.15146666666666667, |
|
"grad_norm": 0.09010102560559675, |
|
"kl": 0.26605224609375, |
|
"learning_rate": 1.5850506544152103e-07, |
|
"loss": 0.0003, |
|
"reward": 1.2786458618938923, |
|
"reward_std": 0.27972705382853746, |
|
"rewards/equation_reward_func": 0.35156250977888703, |
|
"rewards/format_reward_func": 0.9270833432674408, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 263.97916984558105, |
|
"epoch": 0.15253333333333333, |
|
"grad_norm": 0.09699956880184334, |
|
"kl": 0.271728515625, |
|
"learning_rate": 1.5516188633079107e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4088542088866234, |
|
"reward_std": 0.21715012891218066, |
|
"rewards/equation_reward_func": 0.432291679084301, |
|
"rewards/format_reward_func": 0.9765625186264515, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 284.93750762939453, |
|
"epoch": 0.1536, |
|
"grad_norm": 0.13730205530993134, |
|
"kl": 0.26202392578125, |
|
"learning_rate": 1.5183840249960784e-07, |
|
"loss": 0.0003, |
|
"reward": 1.2916666977107525, |
|
"reward_std": 0.2690475699491799, |
|
"rewards/equation_reward_func": 0.3411458432674408, |
|
"rewards/format_reward_func": 0.9505208507180214, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 313.8724036216736, |
|
"epoch": 0.15466666666666667, |
|
"grad_norm": 0.10982987970993405, |
|
"kl": 0.25787353515625, |
|
"learning_rate": 1.4853530414456612e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3359375447034836, |
|
"reward_std": 0.28103851480409503, |
|
"rewards/equation_reward_func": 0.38020834419876337, |
|
"rewards/format_reward_func": 0.9557291865348816, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 280.13021659851074, |
|
"epoch": 0.15573333333333333, |
|
"grad_norm": 0.10569696273751499, |
|
"kl": 0.2752685546875, |
|
"learning_rate": 1.4525327722875568e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3723958730697632, |
|
"reward_std": 0.253665282856673, |
|
"rewards/equation_reward_func": 0.4270833453629166, |
|
"rewards/format_reward_func": 0.9453125223517418, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 266.9010486602783, |
|
"epoch": 0.1568, |
|
"grad_norm": 0.1273947740183966, |
|
"kl": 0.2657470703125, |
|
"learning_rate": 1.4199300333930515e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4635417088866234, |
|
"reward_std": 0.28517728950828314, |
|
"rewards/equation_reward_func": 0.5026041837409139, |
|
"rewards/format_reward_func": 0.9609375111758709, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 328.830735206604, |
|
"epoch": 0.15786666666666666, |
|
"grad_norm": 0.1699855426323704, |
|
"kl": 0.2620849609375, |
|
"learning_rate": 1.3875515954583523e-07, |
|
"loss": 0.0003, |
|
"reward": 1.2187500447034836, |
|
"reward_std": 0.3317327341064811, |
|
"rewards/equation_reward_func": 0.2942708428017795, |
|
"rewards/format_reward_func": 0.9244791902601719, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 329.12240982055664, |
|
"epoch": 0.15893333333333334, |
|
"grad_norm": 0.14001227147909825, |
|
"kl": 0.27099609375, |
|
"learning_rate": 1.3554041825985e-07, |
|
"loss": 0.0003, |
|
"reward": 1.1979167014360428, |
|
"reward_std": 0.2845407989807427, |
|
"rewards/equation_reward_func": 0.28125000931322575, |
|
"rewards/format_reward_func": 0.9166666865348816, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 283.0989661216736, |
|
"epoch": 0.16, |
|
"grad_norm": 0.10223346879835553, |
|
"kl": 0.24761962890625, |
|
"learning_rate": 1.323494470950949e-07, |
|
"loss": 0.0002, |
|
"reward": 1.429687537252903, |
|
"reward_std": 0.26960491156205535, |
|
"rewards/equation_reward_func": 0.47135418094694614, |
|
"rewards/format_reward_func": 0.9583333544433117, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 254.60156726837158, |
|
"epoch": 0.16106666666666666, |
|
"grad_norm": 0.08918786164304986, |
|
"kl": 0.260986328125, |
|
"learning_rate": 1.2918290872891236e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4348958805203438, |
|
"reward_std": 0.15168809751048684, |
|
"rewards/equation_reward_func": 0.458333347691223, |
|
"rewards/format_reward_func": 0.9765625111758709, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 280.38021516799927, |
|
"epoch": 0.16213333333333332, |
|
"grad_norm": 0.10981016883182508, |
|
"kl": 0.26275634765625, |
|
"learning_rate": 1.260414607646213e-07, |
|
"loss": 0.0003, |
|
"reward": 1.3880208618938923, |
|
"reward_std": 0.2798879165202379, |
|
"rewards/equation_reward_func": 0.42708334303461015, |
|
"rewards/format_reward_func": 0.9609375111758709, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 236.63802528381348, |
|
"epoch": 0.1632, |
|
"grad_norm": 0.1126860308935798, |
|
"kl": 0.24639892578125, |
|
"learning_rate": 1.2292575559495143e-07, |
|
"loss": 0.0002, |
|
"reward": 1.5338541939854622, |
|
"reward_std": 0.21581484470516443, |
|
"rewards/equation_reward_func": 0.5598958488553762, |
|
"rewards/format_reward_func": 0.9739583507180214, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 274.51823902130127, |
|
"epoch": 0.16426666666666667, |
|
"grad_norm": 0.11433058952931557, |
|
"kl": 0.2418212890625, |
|
"learning_rate": 1.1983644026655835e-07, |
|
"loss": 0.0002, |
|
"reward": 1.3984375298023224, |
|
"reward_std": 0.2787149855867028, |
|
"rewards/equation_reward_func": 0.4505208458285779, |
|
"rewards/format_reward_func": 0.9479166828095913, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 269.51303005218506, |
|
"epoch": 0.16533333333333333, |
|
"grad_norm": 0.10900628538932935, |
|
"kl": 0.2515869140625, |
|
"learning_rate": 1.1677415634565066e-07, |
|
"loss": 0.0003, |
|
"reward": 1.4531250298023224, |
|
"reward_std": 0.21808092296123505, |
|
"rewards/equation_reward_func": 0.5026041760575026, |
|
"rewards/format_reward_func": 0.950520858168602, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 276.29948711395264, |
|
"epoch": 0.1664, |
|
"grad_norm": 0.10471445766441949, |
|
"kl": 0.24822998046875, |
|
"learning_rate": 1.1373953978475353e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4088542014360428, |
|
"reward_std": 0.2563867177814245, |
|
"rewards/equation_reward_func": 0.4557291786186397, |
|
"rewards/format_reward_func": 0.9531250260770321, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 281.54167652130127, |
|
"epoch": 0.16746666666666668, |
|
"grad_norm": 0.11476171924959432, |
|
"kl": 0.25030517578125, |
|
"learning_rate": 1.1073322079063913e-07, |
|
"loss": 0.0003, |
|
"reward": 1.419270884245634, |
|
"reward_std": 0.2665014350786805, |
|
"rewards/equation_reward_func": 0.46354168374091387, |
|
"rewards/format_reward_func": 0.9557291865348816, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 282.43490409851074, |
|
"epoch": 0.16853333333333334, |
|
"grad_norm": 0.0841971248428421, |
|
"kl": 0.21875, |
|
"learning_rate": 1.0775582369344946e-07, |
|
"loss": 0.0002, |
|
"reward": 1.424479216337204, |
|
"reward_std": 0.2608643379062414, |
|
"rewards/equation_reward_func": 0.46875001303851604, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 273.4349060058594, |
|
"epoch": 0.1696, |
|
"grad_norm": 0.09515899802774246, |
|
"kl": 0.24407958984375, |
|
"learning_rate": 1.0480796681704077e-07, |
|
"loss": 0.0002, |
|
"reward": 1.4010417088866234, |
|
"reward_std": 0.2546477783471346, |
|
"rewards/equation_reward_func": 0.4453125174622983, |
|
"rewards/format_reward_func": 0.9557291865348816, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 316.6458435058594, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.099987410497596, |
|
"kl": 0.23638916015625, |
|
"learning_rate": 1.018902623505741e-07, |
|
"loss": 0.0002, |
|
"reward": 1.2942708656191826, |
|
"reward_std": 0.29723000014200807, |
|
"rewards/equation_reward_func": 0.3671875111758709, |
|
"rewards/format_reward_func": 0.927083358168602, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 288.893235206604, |
|
"epoch": 0.17173333333333332, |
|
"grad_norm": 0.11246455050265577, |
|
"kl": 0.23480224609375, |
|
"learning_rate": 9.900331622138063e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3723958730697632, |
|
"reward_std": 0.289981079287827, |
|
"rewards/equation_reward_func": 0.4114583421032876, |
|
"rewards/format_reward_func": 0.9609375149011612, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 264.9921979904175, |
|
"epoch": 0.1728, |
|
"grad_norm": 0.10025221120521255, |
|
"kl": 0.24884033203125, |
|
"learning_rate": 9.614772796912681e-08, |
|
"loss": 0.0002, |
|
"reward": 1.398437537252903, |
|
"reward_std": 0.21295037120580673, |
|
"rewards/equation_reward_func": 0.4296875102445483, |
|
"rewards/format_reward_func": 0.9687500111758709, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 283.9114661216736, |
|
"epoch": 0.17386666666666667, |
|
"grad_norm": 0.08393060980669469, |
|
"kl": 0.2662353515625, |
|
"learning_rate": 9.332409062130686e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3046875298023224, |
|
"reward_std": 0.211736383382231, |
|
"rewards/equation_reward_func": 0.3437500102445483, |
|
"rewards/format_reward_func": 0.9609375149011612, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 280.97657108306885, |
|
"epoch": 0.17493333333333333, |
|
"grad_norm": 0.09266235555090595, |
|
"kl": 0.26544189453125, |
|
"learning_rate": 9.053299057008699e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3619792014360428, |
|
"reward_std": 0.18739549908787012, |
|
"rewards/equation_reward_func": 0.40364584513008595, |
|
"rewards/format_reward_func": 0.9583333544433117, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 266.9140729904175, |
|
"epoch": 0.176, |
|
"grad_norm": 0.17475099073751835, |
|
"kl": 0.24237060546875, |
|
"learning_rate": 8.777500745052743e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4192708879709244, |
|
"reward_std": 0.2251653028652072, |
|
"rewards/equation_reward_func": 0.45312501629814506, |
|
"rewards/format_reward_func": 0.9661458544433117, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 281.2734489440918, |
|
"epoch": 0.17706666666666668, |
|
"grad_norm": 0.11185068411943261, |
|
"kl": 0.24456787109375, |
|
"learning_rate": 8.505071402020892e-08, |
|
"loss": 0.0002, |
|
"reward": 1.393229216337204, |
|
"reward_std": 0.2644071178510785, |
|
"rewards/equation_reward_func": 0.4453125111758709, |
|
"rewards/format_reward_func": 0.947916679084301, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 283.9192819595337, |
|
"epoch": 0.17813333333333334, |
|
"grad_norm": 0.14116520705594282, |
|
"kl": 0.2410888671875, |
|
"learning_rate": 8.236067604028562e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3723958656191826, |
|
"reward_std": 0.2818891149945557, |
|
"rewards/equation_reward_func": 0.41406250931322575, |
|
"rewards/format_reward_func": 0.9583333544433117, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 260.65104579925537, |
|
"epoch": 0.1792, |
|
"grad_norm": 0.1336225513443869, |
|
"kl": 0.239501953125, |
|
"learning_rate": 7.970545215799327e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4869791939854622, |
|
"reward_std": 0.28690007980912924, |
|
"rewards/equation_reward_func": 0.5390625186264515, |
|
"rewards/format_reward_func": 0.9479166902601719, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 248.3099012374878, |
|
"epoch": 0.18026666666666666, |
|
"grad_norm": 0.09856720056681173, |
|
"kl": 0.23907470703125, |
|
"learning_rate": 7.708559379063204e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4817708656191826, |
|
"reward_std": 0.23133338056504726, |
|
"rewards/equation_reward_func": 0.5260416767559946, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 290.5052146911621, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 0.1122615481772805, |
|
"kl": 0.24444580078125, |
|
"learning_rate": 7.45016450110534e-08, |
|
"loss": 0.0002, |
|
"reward": 1.2838542014360428, |
|
"reward_std": 0.2104581743478775, |
|
"rewards/equation_reward_func": 0.32291667233221233, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 284.09896516799927, |
|
"epoch": 0.1824, |
|
"grad_norm": 0.09965326339693975, |
|
"kl": 0.2493896484375, |
|
"learning_rate": 7.195414243467029e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3906250335276127, |
|
"reward_std": 0.27104497281834483, |
|
"rewards/equation_reward_func": 0.4401041779201478, |
|
"rewards/format_reward_func": 0.9505208469927311, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 282.057297706604, |
|
"epoch": 0.18346666666666667, |
|
"grad_norm": 0.08762325381098879, |
|
"kl": 0.32757568359375, |
|
"learning_rate": 6.944361510801763e-08, |
|
"loss": 0.0003, |
|
"reward": 1.2994792014360428, |
|
"reward_std": 0.23241478390991688, |
|
"rewards/equation_reward_func": 0.3385416711680591, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 279.9739685058594, |
|
"epoch": 0.18453333333333333, |
|
"grad_norm": 0.14683183029957406, |
|
"kl": 0.6046142578125, |
|
"learning_rate": 6.697058439888283e-08, |
|
"loss": 0.0006, |
|
"reward": 1.3697917088866234, |
|
"reward_std": 0.26870738714933395, |
|
"rewards/equation_reward_func": 0.41406251629814506, |
|
"rewards/format_reward_func": 0.955729179084301, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 270.08594703674316, |
|
"epoch": 0.1856, |
|
"grad_norm": 0.12276857645312758, |
|
"kl": 0.24163818359375, |
|
"learning_rate": 6.453556388803288e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4062500484287739, |
|
"reward_std": 0.28894974663853645, |
|
"rewards/equation_reward_func": 0.4557291795499623, |
|
"rewards/format_reward_func": 0.9505208544433117, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 256.07552909851074, |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.1261473193256241, |
|
"kl": 0.29376220703125, |
|
"learning_rate": 6.213905926255697e-08, |
|
"loss": 0.0003, |
|
"reward": 1.4479167014360428, |
|
"reward_std": 0.250754666980356, |
|
"rewards/equation_reward_func": 0.5078125149011612, |
|
"rewards/format_reward_func": 0.9401041902601719, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 249.1354274749756, |
|
"epoch": 0.18773333333333334, |
|
"grad_norm": 0.08506597582252638, |
|
"kl": 0.244384765625, |
|
"learning_rate": 5.978156821084987e-08, |
|
"loss": 0.0002, |
|
"reward": 1.432291705161333, |
|
"reward_std": 0.19336163811385632, |
|
"rewards/equation_reward_func": 0.47916667349636555, |
|
"rewards/format_reward_func": 0.9531250260770321, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 277.8489685058594, |
|
"epoch": 0.1888, |
|
"grad_norm": 0.12037895470125451, |
|
"kl": 0.23748779296875, |
|
"learning_rate": 5.7463580319254853e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3437500521540642, |
|
"reward_std": 0.2519768704660237, |
|
"rewards/equation_reward_func": 0.4114583432674408, |
|
"rewards/format_reward_func": 0.9322916865348816, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 252.59375858306885, |
|
"epoch": 0.18986666666666666, |
|
"grad_norm": 0.11733297431372698, |
|
"kl": 0.239013671875, |
|
"learning_rate": 5.518557697039081e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4557292014360428, |
|
"reward_std": 0.2128398958593607, |
|
"rewards/equation_reward_func": 0.4921875107102096, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 252.0677137374878, |
|
"epoch": 0.19093333333333334, |
|
"grad_norm": 0.08775856965094549, |
|
"kl": 2.04522705078125, |
|
"learning_rate": 5.294803124318145e-08, |
|
"loss": 0.0021, |
|
"reward": 1.5104167088866234, |
|
"reward_std": 0.2261988613754511, |
|
"rewards/equation_reward_func": 0.5338541800156236, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 266.70313262939453, |
|
"epoch": 0.192, |
|
"grad_norm": 0.10933086508784831, |
|
"kl": 0.2430419921875, |
|
"learning_rate": 5.07514078146106e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3984375409781933, |
|
"reward_std": 0.22465246403589845, |
|
"rewards/equation_reward_func": 0.4401041786186397, |
|
"rewards/format_reward_func": 0.9583333507180214, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 244.33073902130127, |
|
"epoch": 0.19306666666666666, |
|
"grad_norm": 0.09058401208636457, |
|
"kl": 0.2347412109375, |
|
"learning_rate": 4.859616286322094e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4895833730697632, |
|
"reward_std": 0.20016511622816324, |
|
"rewards/equation_reward_func": 0.5260416809469461, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 254.1927146911621, |
|
"epoch": 0.19413333333333332, |
|
"grad_norm": 0.06760472710437652, |
|
"kl": 0.24163818359375, |
|
"learning_rate": 4.648274397437829e-08, |
|
"loss": 0.0002, |
|
"reward": 1.416666705161333, |
|
"reward_std": 0.1794181428849697, |
|
"rewards/equation_reward_func": 0.447916679084301, |
|
"rewards/format_reward_func": 0.9687500186264515, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 265.5052156448364, |
|
"epoch": 0.1952, |
|
"grad_norm": 0.09934227406541099, |
|
"kl": 0.24114990234375, |
|
"learning_rate": 4.4411590047320617e-08, |
|
"loss": 0.0002, |
|
"reward": 1.437500037252903, |
|
"reward_std": 0.23727863328531384, |
|
"rewards/equation_reward_func": 0.47916667722165585, |
|
"rewards/format_reward_func": 0.9583333507180214, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 287.24219131469727, |
|
"epoch": 0.19626666666666667, |
|
"grad_norm": 0.08869368411582416, |
|
"kl": 0.2509765625, |
|
"learning_rate": 4.2383131204010494e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3229166977107525, |
|
"reward_std": 0.2696537869051099, |
|
"rewards/equation_reward_func": 0.36718750558793545, |
|
"rewards/format_reward_func": 0.9557291828095913, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 250.90886116027832, |
|
"epoch": 0.19733333333333333, |
|
"grad_norm": 0.11141469624967881, |
|
"kl": 0.2425537109375, |
|
"learning_rate": 4.039778869981064e-08, |
|
"loss": 0.0002, |
|
"reward": 1.408854205161333, |
|
"reward_std": 0.2594145955517888, |
|
"rewards/equation_reward_func": 0.45572917349636555, |
|
"rewards/format_reward_func": 0.9531250223517418, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 288.8619861602783, |
|
"epoch": 0.1984, |
|
"grad_norm": 0.09328173881518842, |
|
"kl": 0.260009765625, |
|
"learning_rate": 3.845597483600049e-08, |
|
"loss": 0.0003, |
|
"reward": 1.2708333618938923, |
|
"reward_std": 0.24974829843267798, |
|
"rewards/equation_reward_func": 0.3281250046566129, |
|
"rewards/format_reward_func": 0.942708358168602, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 257.04167652130127, |
|
"epoch": 0.19946666666666665, |
|
"grad_norm": 0.11987502766296552, |
|
"kl": 0.26214599609375, |
|
"learning_rate": 3.655809287415284e-08, |
|
"loss": 0.0003, |
|
"reward": 1.4140625521540642, |
|
"reward_std": 0.23185446253046393, |
|
"rewards/equation_reward_func": 0.45572918001562357, |
|
"rewards/format_reward_func": 0.9583333544433117, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 246.8671932220459, |
|
"epoch": 0.20053333333333334, |
|
"grad_norm": 0.07938676127449044, |
|
"kl": 0.2530517578125, |
|
"learning_rate": 3.4704536952387285e-08, |
|
"loss": 0.0003, |
|
"reward": 1.4531250298023224, |
|
"reward_std": 0.2485762145370245, |
|
"rewards/equation_reward_func": 0.4843750090803951, |
|
"rewards/format_reward_func": 0.9687500186264515, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 271.40625762939453, |
|
"epoch": 0.2016, |
|
"grad_norm": 0.10397425885690677, |
|
"kl": 0.253662109375, |
|
"learning_rate": 3.2895692003518575e-08, |
|
"loss": 0.0003, |
|
"reward": 1.372395884245634, |
|
"reward_std": 0.2290022149682045, |
|
"rewards/equation_reward_func": 0.419270847691223, |
|
"rewards/format_reward_func": 0.9531250149011612, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 277.70313262939453, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.11176010513775461, |
|
"kl": 0.2552490234375, |
|
"learning_rate": 3.113193367511635e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3489583693444729, |
|
"reward_std": 0.3045574314892292, |
|
"rewards/equation_reward_func": 0.4114583481568843, |
|
"rewards/format_reward_func": 0.9375000186264515, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 264.783860206604, |
|
"epoch": 0.20373333333333332, |
|
"grad_norm": 0.09633127157125651, |
|
"kl": 0.2548828125, |
|
"learning_rate": 2.9413628251493934e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3932292014360428, |
|
"reward_std": 0.26427287235856056, |
|
"rewards/equation_reward_func": 0.4401041737291962, |
|
"rewards/format_reward_func": 0.9531250111758709, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 249.04167366027832, |
|
"epoch": 0.2048, |
|
"grad_norm": 0.06042361226548213, |
|
"kl": 0.25054931640625, |
|
"learning_rate": 2.774113257764066e-08, |
|
"loss": 0.0003, |
|
"reward": 1.4322917088866234, |
|
"reward_std": 0.19768574135378003, |
|
"rewards/equation_reward_func": 0.4817708507180214, |
|
"rewards/format_reward_func": 0.9505208469927311, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 289.47136306762695, |
|
"epoch": 0.20586666666666667, |
|
"grad_norm": 0.09636095745621918, |
|
"kl": 0.24542236328125, |
|
"learning_rate": 2.611479398511518e-08, |
|
"loss": 0.0002, |
|
"reward": 1.291666705161333, |
|
"reward_std": 0.22107936535030603, |
|
"rewards/equation_reward_func": 0.3463541760575026, |
|
"rewards/format_reward_func": 0.9453125186264515, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 247.9687581062317, |
|
"epoch": 0.20693333333333333, |
|
"grad_norm": 0.10748505650467376, |
|
"kl": 0.2657470703125, |
|
"learning_rate": 2.4534950219914057e-08, |
|
"loss": 0.0003, |
|
"reward": 1.494791705161333, |
|
"reward_std": 0.24816493690013885, |
|
"rewards/equation_reward_func": 0.5312500125728548, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 248.68490505218506, |
|
"epoch": 0.208, |
|
"grad_norm": 0.11757891850912854, |
|
"kl": 0.2335205078125, |
|
"learning_rate": 2.300192937233128e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4505208656191826, |
|
"reward_std": 0.22064228588715196, |
|
"rewards/equation_reward_func": 0.4895833458285779, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 271.49480056762695, |
|
"epoch": 0.20906666666666668, |
|
"grad_norm": 0.07918511324806074, |
|
"kl": 0.23931884765625, |
|
"learning_rate": 2.1516049808822935e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3515625409781933, |
|
"reward_std": 0.18755131447687745, |
|
"rewards/equation_reward_func": 0.38020834140479565, |
|
"rewards/format_reward_func": 0.9713541828095913, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 259.51563835144043, |
|
"epoch": 0.21013333333333334, |
|
"grad_norm": 0.1495234231858708, |
|
"kl": 0.241455078125, |
|
"learning_rate": 2.007762010589098e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4947917088866234, |
|
"reward_std": 0.33302151458337903, |
|
"rewards/equation_reward_func": 0.5468750111758709, |
|
"rewards/format_reward_func": 0.9479166828095913, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 280.27344608306885, |
|
"epoch": 0.2112, |
|
"grad_norm": 0.10448152858384566, |
|
"kl": 0.25335693359375, |
|
"learning_rate": 1.8686938986000627e-08, |
|
"loss": 0.0003, |
|
"reward": 1.3593750335276127, |
|
"reward_std": 0.2329879915341735, |
|
"rewards/equation_reward_func": 0.3984375107102096, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 243.24479961395264, |
|
"epoch": 0.21226666666666666, |
|
"grad_norm": 0.10158686561243806, |
|
"kl": 0.26129150390625, |
|
"learning_rate": 1.734429525554365e-08, |
|
"loss": 0.0003, |
|
"reward": 1.5312500447034836, |
|
"reward_std": 0.26366367703303695, |
|
"rewards/equation_reward_func": 0.5677083469927311, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 262.8463611602783, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.0879686678616527, |
|
"kl": 0.23956298828125, |
|
"learning_rate": 1.604996774486145e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4479167088866234, |
|
"reward_std": 0.2433197470381856, |
|
"rewards/equation_reward_func": 0.4973958428017795, |
|
"rewards/format_reward_func": 0.9505208544433117, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 255.82292366027832, |
|
"epoch": 0.2144, |
|
"grad_norm": 0.09500435271087032, |
|
"kl": 0.23638916015625, |
|
"learning_rate": 1.4804225250339281e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4192708730697632, |
|
"reward_std": 0.23259615385904908, |
|
"rewards/equation_reward_func": 0.4609375165309757, |
|
"rewards/format_reward_func": 0.9583333469927311, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 217.63802671432495, |
|
"epoch": 0.21546666666666667, |
|
"grad_norm": 0.10034531857683562, |
|
"kl": 0.2501220703125, |
|
"learning_rate": 1.360732647858498e-08, |
|
"loss": 0.0003, |
|
"reward": 1.5156250335276127, |
|
"reward_std": 0.17062418861314654, |
|
"rewards/equation_reward_func": 0.5442708432674408, |
|
"rewards/format_reward_func": 0.9713541865348816, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 234.06771516799927, |
|
"epoch": 0.21653333333333333, |
|
"grad_norm": 0.12227115923971459, |
|
"kl": 0.248779296875, |
|
"learning_rate": 1.2459519992702311e-08, |
|
"loss": 0.0002, |
|
"reward": 1.4921875298023224, |
|
"reward_std": 0.23618489829823375, |
|
"rewards/equation_reward_func": 0.5234375111758709, |
|
"rewards/format_reward_func": 0.9687500149011612, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 243.70052909851074, |
|
"epoch": 0.2176, |
|
"grad_norm": 0.15926056972625335, |
|
"kl": 0.27069091796875, |
|
"learning_rate": 1.1361044160671629e-08, |
|
"loss": 0.0003, |
|
"reward": 1.4505208618938923, |
|
"reward_std": 0.28682674188166857, |
|
"rewards/equation_reward_func": 0.5052083535119891, |
|
"rewards/format_reward_func": 0.9453125149011612, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 275.8385486602783, |
|
"epoch": 0.21866666666666668, |
|
"grad_norm": 0.12757068910568817, |
|
"kl": 0.24969482421875, |
|
"learning_rate": 1.0312127105846947e-08, |
|
"loss": 0.0002, |
|
"reward": 1.3645833730697632, |
|
"reward_std": 0.21345845330506563, |
|
"rewards/equation_reward_func": 0.40104167675599456, |
|
"rewards/format_reward_func": 0.9635416828095913, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 287.200532913208, |
|
"epoch": 0.21973333333333334, |
|
"grad_norm": 0.11457759488995656, |
|
"kl": 0.244384765625, |
|
"learning_rate": 9.312986659581301e-09, |
|
"loss": 0.0002, |
|
"reward": 1.3229166977107525, |
|
"reward_std": 0.21469376189634204, |
|
"rewards/equation_reward_func": 0.3593750111758709, |
|
"rewards/format_reward_func": 0.9635416865348816, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 265.48438358306885, |
|
"epoch": 0.2208, |
|
"grad_norm": 0.12852298602657852, |
|
"kl": 0.2640380859375, |
|
"learning_rate": 8.363830315988945e-09, |
|
"loss": 0.0003, |
|
"reward": 1.3463541828095913, |
|
"reward_std": 0.23709475807845592, |
|
"rewards/equation_reward_func": 0.39583334093913436, |
|
"rewards/format_reward_func": 0.9505208507180214, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 271.54688358306885, |
|
"epoch": 0.22186666666666666, |
|
"grad_norm": 0.08478231012580131, |
|
"kl": 0.28369140625, |
|
"learning_rate": 7.46485518885462e-09, |
|
"loss": 0.0003, |
|
"reward": 1.3489583693444729, |
|
"reward_std": 0.22044954542070627, |
|
"rewards/equation_reward_func": 0.3932291807141155, |
|
"rewards/format_reward_func": 0.9557291828095913, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 243.90625667572021, |
|
"epoch": 0.22293333333333334, |
|
"grad_norm": 0.10846557765273872, |
|
"kl": 0.24072265625, |
|
"learning_rate": 6.616247970698319e-09, |
|
"loss": 0.0002, |
|
"reward": 1.533854205161333, |
|
"reward_std": 0.2181540415622294, |
|
"rewards/equation_reward_func": 0.5598958469927311, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 264.71094608306885, |
|
"epoch": 0.224, |
|
"grad_norm": 0.1142319675311567, |
|
"kl": 0.261962890625, |
|
"learning_rate": 5.8181848940044855e-09, |
|
"loss": 0.0003, |
|
"reward": 1.4114583805203438, |
|
"reward_std": 0.22676061373203993, |
|
"rewards/equation_reward_func": 0.4635416807141155, |
|
"rewards/format_reward_func": 0.9479166828095913, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 229.906259059906, |
|
"epoch": 0.22506666666666666, |
|
"grad_norm": 0.09559713141008308, |
|
"kl": 0.2418212890625, |
|
"learning_rate": 5.070831694623135e-09, |
|
"loss": 0.0002, |
|
"reward": 1.531250037252903, |
|
"reward_std": 0.19829656789079309, |
|
"rewards/equation_reward_func": 0.5703125149011612, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 248.05209159851074, |
|
"epoch": 0.22613333333333333, |
|
"grad_norm": 0.11040696978140259, |
|
"kl": 0.24212646484375, |
|
"learning_rate": 4.374343577351336e-09, |
|
"loss": 0.0002, |
|
"reward": 1.4192708656191826, |
|
"reward_std": 0.27204828383401036, |
|
"rewards/equation_reward_func": 0.4531250123400241, |
|
"rewards/format_reward_func": 0.9661458544433117, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 257.89584159851074, |
|
"epoch": 0.2272, |
|
"grad_norm": 0.0992000332189083, |
|
"kl": 0.24200439453125, |
|
"learning_rate": 3.7288651837012745e-09, |
|
"loss": 0.0002, |
|
"reward": 1.4062500298023224, |
|
"reward_std": 0.2646353510208428, |
|
"rewards/equation_reward_func": 0.45312501094304025, |
|
"rewards/format_reward_func": 0.9531250223517418, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 231.69792461395264, |
|
"epoch": 0.22826666666666667, |
|
"grad_norm": 0.1407050044165881, |
|
"kl": 0.27252197265625, |
|
"learning_rate": 3.134530561862081e-09, |
|
"loss": 0.0003, |
|
"reward": 1.4869792014360428, |
|
"reward_std": 0.15594792971387506, |
|
"rewards/equation_reward_func": 0.5104166809469461, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 276.9817762374878, |
|
"epoch": 0.22933333333333333, |
|
"grad_norm": 0.1167791204621414, |
|
"kl": 0.24884033203125, |
|
"learning_rate": 2.5914631388619103e-09, |
|
"loss": 0.0002, |
|
"reward": 1.3463542088866234, |
|
"reward_std": 0.21618649549782276, |
|
"rewards/equation_reward_func": 0.3906250107102096, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 249.96094417572021, |
|
"epoch": 0.2304, |
|
"grad_norm": 0.10179382560252617, |
|
"kl": 0.24853515625, |
|
"learning_rate": 2.0997756949353297e-09, |
|
"loss": 0.0002, |
|
"reward": 1.4817708656191826, |
|
"reward_std": 0.20142082124948502, |
|
"rewards/equation_reward_func": 0.513020845130086, |
|
"rewards/format_reward_func": 0.9687500111758709, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 304.00261211395264, |
|
"epoch": 0.23146666666666665, |
|
"grad_norm": 0.12633771333357205, |
|
"kl": 0.2705078125, |
|
"learning_rate": 1.6595703401020844e-09, |
|
"loss": 0.0003, |
|
"reward": 1.2734375409781933, |
|
"reward_std": 0.2817671154625714, |
|
"rewards/equation_reward_func": 0.3307291779201478, |
|
"rewards/format_reward_func": 0.9427083507180214, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 244.33594417572021, |
|
"epoch": 0.23253333333333334, |
|
"grad_norm": 0.12174371002417166, |
|
"kl": 0.24542236328125, |
|
"learning_rate": 1.2709384929615596e-09, |
|
"loss": 0.0002, |
|
"reward": 1.4817708805203438, |
|
"reward_std": 0.24246670864522457, |
|
"rewards/equation_reward_func": 0.5208333441987634, |
|
"rewards/format_reward_func": 0.9609375186264515, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 274.6692781448364, |
|
"epoch": 0.2336, |
|
"grad_norm": 0.08153629624949502, |
|
"kl": 0.2364501953125, |
|
"learning_rate": 9.339608617077165e-10, |
|
"loss": 0.0002, |
|
"reward": 1.3958333730697632, |
|
"reward_std": 0.18359084147959948, |
|
"rewards/equation_reward_func": 0.44531250884756446, |
|
"rewards/format_reward_func": 0.9505208544433117, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 271.5208406448364, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.10062195336090982, |
|
"kl": 0.25860595703125, |
|
"learning_rate": 6.487074273681114e-10, |
|
"loss": 0.0003, |
|
"reward": 1.3567708730697632, |
|
"reward_std": 0.2888470063917339, |
|
"rewards/equation_reward_func": 0.4062500102445483, |
|
"rewards/format_reward_func": 0.950520858168602, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 279.31511306762695, |
|
"epoch": 0.23573333333333332, |
|
"grad_norm": 0.11347953554392516, |
|
"kl": 0.27435302734375, |
|
"learning_rate": 4.152374292708538e-10, |
|
"loss": 0.0003, |
|
"reward": 1.328125037252903, |
|
"reward_std": 0.2600484313443303, |
|
"rewards/equation_reward_func": 0.3723958460614085, |
|
"rewards/format_reward_func": 0.9557291902601719, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 229.1927137374878, |
|
"epoch": 0.2368, |
|
"grad_norm": 0.08761777334438094, |
|
"kl": 0.23480224609375, |
|
"learning_rate": 2.3359935274214204e-10, |
|
"loss": 0.0002, |
|
"reward": 1.5416667088866234, |
|
"reward_std": 0.19063151394948363, |
|
"rewards/equation_reward_func": 0.5651041842065752, |
|
"rewards/format_reward_func": 0.9765625149011612, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 269.6927146911621, |
|
"epoch": 0.23786666666666667, |
|
"grad_norm": 0.08325007668726372, |
|
"kl": 0.24908447265625, |
|
"learning_rate": 1.0383091903720665e-10, |
|
"loss": 0.0002, |
|
"reward": 1.3802083730697632, |
|
"reward_std": 0.19487999146804214, |
|
"rewards/equation_reward_func": 0.41927084675990045, |
|
"rewards/format_reward_func": 0.9609375223517418, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 252.57813453674316, |
|
"epoch": 0.23893333333333333, |
|
"grad_norm": 0.07984790038875238, |
|
"kl": 0.24322509765625, |
|
"learning_rate": 2.595907750671533e-11, |
|
"loss": 0.0002, |
|
"reward": 1.4505208730697632, |
|
"reward_std": 0.1807808456942439, |
|
"rewards/equation_reward_func": 0.47656251257285476, |
|
"rewards/format_reward_func": 0.9739583469927311, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 277.79948806762695, |
|
"epoch": 0.24, |
|
"grad_norm": 0.11593052361546653, |
|
"kl": 0.26580810546875, |
|
"learning_rate": 0.0, |
|
"loss": 0.0003, |
|
"reward": 1.3541667088866234, |
|
"reward_std": 0.2557070981711149, |
|
"rewards/equation_reward_func": 0.40364584303461015, |
|
"rewards/format_reward_func": 0.9505208469927311, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"step": 450, |
|
"total_flos": 0.0, |
|
"train_loss": 0.00020930594997387746, |
|
"train_runtime": 20107.4364, |
|
"train_samples_per_second": 0.537, |
|
"train_steps_per_second": 0.022 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 450, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |