|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 16.666666666666668, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 405.25, |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.4627874195575714, |
|
"kl": 0.005518825375474989, |
|
"learning_rate": 8.333333333333333e-08, |
|
"loss": 0.0, |
|
"reward": 0.9609934687614441, |
|
"reward_std": 0.9843197166919708, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.48086845502257347, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.17412499990314245, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 456.9375, |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.34028512239456177, |
|
"kl": 0.0030479243432637304, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0, |
|
"reward": 0.9757307097315788, |
|
"reward_std": 1.0358425956219435, |
|
"rewards/concensus_correctness_reward_func": 0.013249999843537807, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.42269944585859776, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.21165626123547554, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 541.40625, |
|
"epoch": 0.5, |
|
"grad_norm": 0.6734604835510254, |
|
"kl": 0.0027404608263168484, |
|
"learning_rate": 4.1666666666666667e-07, |
|
"loss": 0.0, |
|
"reward": 0.9920943900942802, |
|
"reward_std": 0.8858779668807983, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5916256122291088, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.24421875784173608, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 510.90625, |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.38359829783439636, |
|
"kl": 0.004760640629683621, |
|
"learning_rate": 4.99967220916408e-07, |
|
"loss": 0.0, |
|
"reward": 1.3815101590007544, |
|
"reward_std": 0.6916671060025692, |
|
"rewards/concensus_correctness_reward_func": 0.06412499956786633, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6536664366722107, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.3668437581509352, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 425.09375, |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.5544623732566833, |
|
"kl": 0.0043694232881534845, |
|
"learning_rate": 4.997050398198976e-07, |
|
"loss": 0.0, |
|
"reward": 1.6439796090126038, |
|
"reward_std": 0.8410376366227865, |
|
"rewards/concensus_correctness_reward_func": 0.0768750011920929, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5118233747780323, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.10215624794363976, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 424.5, |
|
"epoch": 1.0, |
|
"grad_norm": 0.3566124141216278, |
|
"kl": 0.0029268394282553345, |
|
"learning_rate": 4.991809526186423e-07, |
|
"loss": 0.0, |
|
"reward": 0.7868102770298719, |
|
"reward_std": 0.9274830296635628, |
|
"rewards/concensus_correctness_reward_func": 0.04543749988079071, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5272165313363075, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03584375884383917, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 455.28125, |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.4540335536003113, |
|
"kl": 0.0037232608010526747, |
|
"learning_rate": 4.983955090077444e-07, |
|
"loss": 0.0, |
|
"reward": 0.9765743091702461, |
|
"reward_std": 1.24457941763103, |
|
"rewards/concensus_correctness_reward_func": 0.014875000342726707, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4753867909312248, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.09568749740719795, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 377.21875, |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 1.5153367519378662, |
|
"kl": 0.0050389311800245196, |
|
"learning_rate": 4.973495328090889e-07, |
|
"loss": 0.0, |
|
"reward": 1.0638171210885048, |
|
"reward_std": 1.0938183590769768, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5703795924782753, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.1965624913573265, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 472.5, |
|
"epoch": 1.5, |
|
"grad_norm": 0.3531189560890198, |
|
"kl": 0.0040535782754886895, |
|
"learning_rate": 4.960441211072685e-07, |
|
"loss": 0.0, |
|
"reward": 1.1587776821106672, |
|
"reward_std": 1.1633746810257435, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5646526888012886, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.03162500262260437, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 432.9375, |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.42279884219169617, |
|
"kl": 0.0036564485053531826, |
|
"learning_rate": 4.944806430988927e-07, |
|
"loss": 0.0, |
|
"reward": 0.8500742139294744, |
|
"reward_std": 1.044841218739748, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4372929581440985, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": -0.040343739092350006, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 503.65625, |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.35253623127937317, |
|
"kl": 0.002885653229895979, |
|
"learning_rate": 4.926607386564898e-07, |
|
"loss": 0.0, |
|
"reward": 0.9196254201233387, |
|
"reward_std": 1.1270109415054321, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6083129234611988, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.061312501318752766, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 390.3125, |
|
"epoch": 2.0, |
|
"grad_norm": 0.7617405652999878, |
|
"kl": 0.0034773742081597447, |
|
"learning_rate": 4.905863166085075e-07, |
|
"loss": 0.0, |
|
"reward": 0.9421910382807255, |
|
"reward_std": 1.0266394726932049, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5005660317838192, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.21375000849366188, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 446.375, |
|
"epoch": 2.1666666666666665, |
|
"grad_norm": 0.539344310760498, |
|
"kl": 0.0038461567601189017, |
|
"learning_rate": 4.882595527372152e-07, |
|
"loss": 0.0, |
|
"reward": 0.990186957642436, |
|
"reward_std": 1.246212176978588, |
|
"rewards/concensus_correctness_reward_func": 0.025624999776482582, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.39828070998191833, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.12878123950213194, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 484.21875, |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 0.573151171207428, |
|
"kl": 0.003478569706203416, |
|
"learning_rate": 4.856828874966086e-07, |
|
"loss": 0.0, |
|
"reward": 1.1689446438103914, |
|
"reward_std": 1.0504138097167015, |
|
"rewards/concensus_correctness_reward_func": 0.02199999988079071, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.656163401901722, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.0376562625169754, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 420.25, |
|
"epoch": 2.5, |
|
"grad_norm": 0.5913636088371277, |
|
"kl": 0.004606734742992558, |
|
"learning_rate": 4.828590234527106e-07, |
|
"loss": 0.0, |
|
"reward": 1.556147813796997, |
|
"reward_std": 2.1363642998039722, |
|
"rewards/concensus_correctness_reward_func": 0.625, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.3810540623962879, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.221968749538064, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 428.28125, |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.4051259756088257, |
|
"kl": 0.003205618428182788, |
|
"learning_rate": 4.79790922448953e-07, |
|
"loss": 0.0, |
|
"reward": 1.021159190684557, |
|
"reward_std": 0.932405311614275, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5468154139816761, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.20871875435113907, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 477.0, |
|
"epoch": 2.8333333333333335, |
|
"grad_norm": 1.2722704410552979, |
|
"kl": 0.003231980503187515, |
|
"learning_rate": 4.7648180249961165e-07, |
|
"loss": 0.0, |
|
"reward": 1.096991840749979, |
|
"reward_std": 1.2813670113682747, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5021168403327465, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.18862500227987766, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 517.65625, |
|
"epoch": 3.0, |
|
"grad_norm": 0.5277615785598755, |
|
"kl": 0.003874574991641566, |
|
"learning_rate": 4.7293513441455357e-07, |
|
"loss": 0.0, |
|
"reward": 0.7769100246950984, |
|
"reward_std": 1.269415270537138, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5426287725567818, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.031343746930360794, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 507.5, |
|
"epoch": 3.1666666666666665, |
|
"grad_norm": 0.4287290871143341, |
|
"kl": 0.0033028597244992852, |
|
"learning_rate": 4.691546381588369e-07, |
|
"loss": 0.0, |
|
"reward": 0.550812273286283, |
|
"reward_std": 1.1209257319569588, |
|
"rewards/concensus_correctness_reward_func": 0.02306249924004078, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.3885622890666127, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.11081249732524157, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 508.34375, |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.40618908405303955, |
|
"kl": 0.0037306795711629093, |
|
"learning_rate": 4.651442789509813e-07, |
|
"loss": 0.0, |
|
"reward": 0.7808872517198324, |
|
"reward_std": 0.8644677195698023, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6908872574567795, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.034999994561076164, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 409.46875, |
|
"epoch": 3.5, |
|
"grad_norm": 0.39888831973075867, |
|
"kl": 0.004624970955774188, |
|
"learning_rate": 4.609082631040011e-07, |
|
"loss": 0.0, |
|
"reward": 1.320845801383257, |
|
"reward_std": 1.0828375816345215, |
|
"rewards/concensus_correctness_reward_func": 0.07012500241398811, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5185021087527275, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.3259687628597021, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 454.125, |
|
"epoch": 3.6666666666666665, |
|
"grad_norm": 5.556149959564209, |
|
"kl": 0.003802574341534637, |
|
"learning_rate": 4.5645103361356407e-07, |
|
"loss": 0.0, |
|
"reward": 0.8918353654444218, |
|
"reward_std": 1.0566863603889942, |
|
"rewards/concensus_correctness_reward_func": 0.025624999776482582, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5350853689014912, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.08112499676644802, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 457.34375, |
|
"epoch": 3.8333333333333335, |
|
"grad_norm": 0.35926172137260437, |
|
"kl": 0.003958026180043817, |
|
"learning_rate": 4.517772654979023e-07, |
|
"loss": 0.0, |
|
"reward": 0.9805956035852432, |
|
"reward_std": 0.7859199158847332, |
|
"rewards/concensus_correctness_reward_func": 0.0260624997317791, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5270956140011549, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.30243751779198647, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 429.90625, |
|
"epoch": 4.0, |
|
"grad_norm": 0.5926030874252319, |
|
"kl": 0.005827751272590831, |
|
"learning_rate": 4.468918608943636e-07, |
|
"loss": 0.0, |
|
"reward": 1.7501742951571941, |
|
"reward_std": 2.434488591738045, |
|
"rewards/concensus_correctness_reward_func": 0.625, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.38520555198192596, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.25559374690055847, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 463.65625, |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 0.7409794330596924, |
|
"kl": 0.00406863066018559, |
|
"learning_rate": 4.417999439177465e-07, |
|
"loss": 0.0, |
|
"reward": 1.2483692020177841, |
|
"reward_std": 0.8449539989233017, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.359212932176888, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.4269062578678131, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 437.71875, |
|
"epoch": 4.333333333333333, |
|
"grad_norm": 0.39966848492622375, |
|
"kl": 0.0033464983571320772, |
|
"learning_rate": 4.365068552858115e-07, |
|
"loss": 0.0, |
|
"reward": 1.5179361971095204, |
|
"reward_std": 1.1835376657545567, |
|
"rewards/concensus_correctness_reward_func": 0.02199999988079071, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5100300014019012, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3452812507748604, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 402.59375, |
|
"epoch": 4.5, |
|
"grad_norm": 0.5886970162391663, |
|
"kl": 0.004897425736999139, |
|
"learning_rate": 4.310181467176054e-07, |
|
"loss": 0.0, |
|
"reward": 1.4094564961269498, |
|
"reward_std": 0.9502353556454182, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5278940042480826, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.178437490016222, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 432.09375, |
|
"epoch": 4.666666666666667, |
|
"grad_norm": 0.41223806142807007, |
|
"kl": 0.004628591472283006, |
|
"learning_rate": 4.253395751104748e-07, |
|
"loss": 0.0, |
|
"reward": 2.7664652466773987, |
|
"reward_std": 2.371488120406866, |
|
"rewards/concensus_correctness_reward_func": 1.2650624997913837, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6736840140074492, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.20271877851337194, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 458.21875, |
|
"epoch": 4.833333333333333, |
|
"grad_norm": 1.161920428276062, |
|
"kl": 0.004868229589192197, |
|
"learning_rate": 4.194770965018758e-07, |
|
"loss": 0.0, |
|
"reward": 0.8611085470765829, |
|
"reward_std": 0.9239636212587357, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4858272895216942, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.0940312510356307, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 575.1875, |
|
"epoch": 5.0, |
|
"grad_norm": 0.4627440869808197, |
|
"kl": 0.0028841852181358263, |
|
"learning_rate": 4.1343685982231315e-07, |
|
"loss": 0.0, |
|
"reward": 0.7771566994488239, |
|
"reward_std": 0.9787662029266357, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6247504502534866, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": -0.0038437489420175552, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 452.625, |
|
"epoch": 5.166666666666667, |
|
"grad_norm": 0.372567355632782, |
|
"kl": 0.0034624057734617963, |
|
"learning_rate": 4.072252004459611e-07, |
|
"loss": 0.0, |
|
"reward": 0.7446466162800789, |
|
"reward_std": 0.7917606569826603, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5209591202437878, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.14556249883025885, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 462.46875, |
|
"epoch": 5.333333333333333, |
|
"grad_norm": 0.855252742767334, |
|
"kl": 0.005477891769260168, |
|
"learning_rate": 4.0084863354573116e-07, |
|
"loss": 0.0, |
|
"reward": 1.2447932958602905, |
|
"reward_std": 1.16534423828125, |
|
"rewards/concensus_correctness_reward_func": 0.04312499985098839, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6511683035641909, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.0973750026896596, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 552.125, |
|
"epoch": 5.5, |
|
"grad_norm": 0.7227801084518433, |
|
"kl": 0.0039346517296507955, |
|
"learning_rate": 3.9431384725975485e-07, |
|
"loss": 0.0, |
|
"reward": 1.028742030262947, |
|
"reward_std": 1.049393068999052, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6304295305162668, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.11706250137649477, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 403.0, |
|
"epoch": 5.666666666666667, |
|
"grad_norm": 0.4475947320461273, |
|
"kl": 0.003715590573847294, |
|
"learning_rate": 3.876276956764509e-07, |
|
"loss": 0.0, |
|
"reward": 0.9110647588968277, |
|
"reward_std": 0.9707382656633854, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5140022300183773, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.07543750200420618, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 445.28125, |
|
"epoch": 5.833333333333333, |
|
"grad_norm": 0.48278599977493286, |
|
"kl": 0.003939619520679116, |
|
"learning_rate": 3.807971916455325e-07, |
|
"loss": 0.0, |
|
"reward": 1.055080994963646, |
|
"reward_std": 1.2731535732746124, |
|
"rewards/concensus_correctness_reward_func": 0.04543749988079071, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5798934735357761, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.007750006392598152, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 514.09375, |
|
"epoch": 6.0, |
|
"grad_norm": 0.5504283308982849, |
|
"kl": 0.0036322681698948145, |
|
"learning_rate": 3.738294994224969e-07, |
|
"loss": 0.0, |
|
"reward": 0.7345115430653095, |
|
"reward_std": 0.9186943359673023, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.35060528852045536, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.11828124895691872, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 381.0625, |
|
"epoch": 6.166666666666667, |
|
"grad_norm": 0.5463396310806274, |
|
"kl": 0.004689242952736095, |
|
"learning_rate": 3.6673192715431014e-07, |
|
"loss": 0.0, |
|
"reward": 1.040903776884079, |
|
"reward_std": 0.8909546323120594, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.3645600201562047, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.2857187483459711, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 461.90625, |
|
"epoch": 6.333333333333333, |
|
"grad_norm": 0.5456697344779968, |
|
"kl": 0.003803928440902382, |
|
"learning_rate": 3.595119192141706e-07, |
|
"loss": 0.0, |
|
"reward": 1.3395836614072323, |
|
"reward_std": 1.3747756779193878, |
|
"rewards/concensus_correctness_reward_func": 0.07437499985098839, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5419899076223373, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.3169687418267131, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 491.09375, |
|
"epoch": 6.5, |
|
"grad_norm": 0.4154431223869324, |
|
"kl": 0.004837977059651166, |
|
"learning_rate": 3.5217704839338905e-07, |
|
"loss": 0.0, |
|
"reward": 0.9017609618604183, |
|
"reward_std": 1.2408568933606148, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.519823431968689, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": -0.008687485009431839, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 459.5625, |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.4849317669868469, |
|
"kl": 0.003633130807429552, |
|
"learning_rate": 3.447350079585767e-07, |
|
"loss": 0.0, |
|
"reward": 1.0950381644070148, |
|
"reward_std": 0.92024633474648, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.4737569224089384, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.003718756139278412, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 500.5625, |
|
"epoch": 6.833333333333333, |
|
"grad_norm": 0.5481275916099548, |
|
"kl": 0.00398016459075734, |
|
"learning_rate": 3.3719360358247053e-07, |
|
"loss": 0.0, |
|
"reward": 0.981081947684288, |
|
"reward_std": 1.2141644582152367, |
|
"rewards/concensus_correctness_reward_func": 0.055250002071261406, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.49655068665742874, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.054281254298985004, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 395.84375, |
|
"epoch": 7.0, |
|
"grad_norm": 0.4870210886001587, |
|
"kl": 0.005125438881805167, |
|
"learning_rate": 3.29560745156861e-07, |
|
"loss": 0.0, |
|
"reward": 2.616707056760788, |
|
"reward_std": 2.2652119155973196, |
|
"rewards/concensus_correctness_reward_func": 1.2730624992400408, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6221445724368095, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.3777499981224537, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 397.6875, |
|
"epoch": 7.166666666666667, |
|
"grad_norm": 1.8424495458602905, |
|
"kl": 0.005850363930221647, |
|
"learning_rate": 3.218444384962071e-07, |
|
"loss": 0.0, |
|
"reward": 0.7416995037347078, |
|
"reward_std": 0.7169875521212816, |
|
"rewards/concensus_correctness_reward_func": 0.030124999582767487, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.42598075792193413, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.019968749955296516, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 451.84375, |
|
"epoch": 7.333333333333333, |
|
"grad_norm": 1.3289145231246948, |
|
"kl": 0.005223560554441065, |
|
"learning_rate": 3.1405277694064305e-07, |
|
"loss": 0.0, |
|
"reward": 1.037564679980278, |
|
"reward_std": 0.8682594746351242, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4709709379822016, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.050968751311302185, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 507.65625, |
|
"epoch": 7.5, |
|
"grad_norm": 0.5467274785041809, |
|
"kl": 0.00422226672526449, |
|
"learning_rate": 3.0619393286718237e-07, |
|
"loss": 0.0, |
|
"reward": 2.325454168021679, |
|
"reward_std": 2.455311745405197, |
|
"rewards/concensus_correctness_reward_func": 1.270999999716878, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6971416063606739, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": -0.09581249579787254, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 431.34375, |
|
"epoch": 7.666666666666667, |
|
"grad_norm": 0.4258057773113251, |
|
"kl": 0.005024322046665475, |
|
"learning_rate": 2.98276149118022e-07, |
|
"loss": 0.0, |
|
"reward": 1.168196927756071, |
|
"reward_std": 1.1126753464341164, |
|
"rewards/concensus_correctness_reward_func": 0.02199999988079071, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5554468911141157, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.18450001627206802, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 420.5, |
|
"epoch": 7.833333333333333, |
|
"grad_norm": 3.349438428878784, |
|
"kl": 0.008144651423208416, |
|
"learning_rate": 2.903077303549399e-07, |
|
"loss": 0.0, |
|
"reward": 1.0509218061342835, |
|
"reward_std": 1.0600119307637215, |
|
"rewards/concensus_correctness_reward_func": 0.04312499985098839, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.39529681019484997, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.2374999923631549, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 434.96875, |
|
"epoch": 8.0, |
|
"grad_norm": 0.49676457047462463, |
|
"kl": 0.00413627817761153, |
|
"learning_rate": 2.822970343488516e-07, |
|
"loss": 0.0, |
|
"reward": 0.9490811452269554, |
|
"reward_std": 0.8340496830642223, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5972061259672046, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.16437500715255737, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 403.90625, |
|
"epoch": 8.166666666666666, |
|
"grad_norm": 0.5876235961914062, |
|
"kl": 0.007058300951030105, |
|
"learning_rate": 2.7425246321366205e-07, |
|
"loss": 0.0, |
|
"reward": 1.397300623357296, |
|
"reward_std": 0.7023336328566074, |
|
"rewards/concensus_correctness_reward_func": 0.020999999716877937, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.583988162688911, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.15168750286102295, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 533.15625, |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.429283082485199, |
|
"kl": 0.004187059836112894, |
|
"learning_rate": 2.661824545936089e-07, |
|
"loss": 0.0, |
|
"reward": 1.2581686060875654, |
|
"reward_std": 1.170703399926424, |
|
"rewards/concensus_correctness_reward_func": 0.02306249924004078, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.697043601423502, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": -0.008812484331429005, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 445.3125, |
|
"epoch": 8.5, |
|
"grad_norm": 0.5875679850578308, |
|
"kl": 0.0034884795895777643, |
|
"learning_rate": 2.58095472813339e-07, |
|
"loss": 0.0, |
|
"reward": 0.9280952122062445, |
|
"reward_std": 0.9815445207059383, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5310951881110668, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.16912501025944948, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 383.84375, |
|
"epoch": 8.666666666666666, |
|
"grad_norm": 0.5602811574935913, |
|
"kl": 0.00470353034324944, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.0, |
|
"reward": 1.1036290470510721, |
|
"reward_std": 1.118781317025423, |
|
"rewards/concensus_correctness_reward_func": 0.015062499791383743, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.4021915583871305, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.10825000796467066, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 494.15625, |
|
"epoch": 8.833333333333334, |
|
"grad_norm": 0.45191463828086853, |
|
"kl": 0.005157338426215574, |
|
"learning_rate": 2.4190452718666105e-07, |
|
"loss": 0.0, |
|
"reward": 0.9766125809401274, |
|
"reward_std": 0.8945337496697903, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5054875910282135, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.028874988667666912, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 495.875, |
|
"epoch": 9.0, |
|
"grad_norm": 0.5252740383148193, |
|
"kl": 0.003989085234934464, |
|
"learning_rate": 2.3381754540639106e-07, |
|
"loss": 0.0, |
|
"reward": 1.4440684113651514, |
|
"reward_std": 2.4300696589052677, |
|
"rewards/concensus_correctness_reward_func": 0.6704375147819519, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.4195683840662241, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.08343750424683094, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 548.71875, |
|
"epoch": 9.166666666666666, |
|
"grad_norm": 0.3460928201675415, |
|
"kl": 0.004296147613786161, |
|
"learning_rate": 2.2574753678633798e-07, |
|
"loss": 0.0, |
|
"reward": 1.181284163147211, |
|
"reward_std": 0.9245094396173954, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5359716564416885, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.2546875076368451, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 407.4375, |
|
"epoch": 9.333333333333334, |
|
"grad_norm": 0.49219369888305664, |
|
"kl": 0.004819943045731634, |
|
"learning_rate": 2.1770296565114846e-07, |
|
"loss": 0.0, |
|
"reward": 0.96442811191082, |
|
"reward_std": 1.1890033707022667, |
|
"rewards/concensus_correctness_reward_func": 0.051249999552965164, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.42905314173549414, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.12475000228732824, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 462.34375, |
|
"epoch": 9.5, |
|
"grad_norm": 0.4612361788749695, |
|
"kl": 0.004889295043540187, |
|
"learning_rate": 2.0969226964506005e-07, |
|
"loss": 0.0, |
|
"reward": 0.9236417841166258, |
|
"reward_std": 0.9137359261512756, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5326105281710625, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.00040626712143421173, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 436.25, |
|
"epoch": 9.666666666666666, |
|
"grad_norm": 0.42146608233451843, |
|
"kl": 0.004314293968491256, |
|
"learning_rate": 2.0172385088197803e-07, |
|
"loss": 0.0, |
|
"reward": 0.8642384810373187, |
|
"reward_std": 0.9441619887948036, |
|
"rewards/concensus_correctness_reward_func": 0.010999999940395355, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5045822560787201, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.27053125388920307, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 414.46875, |
|
"epoch": 9.833333333333334, |
|
"grad_norm": 1.0195592641830444, |
|
"kl": 0.007958032132592052, |
|
"learning_rate": 1.9380606713281772e-07, |
|
"loss": 0.0, |
|
"reward": 2.222010016441345, |
|
"reward_std": 2.534605525434017, |
|
"rewards/concensus_correctness_reward_func": 0.625, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5085724969394505, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.4478124938905239, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 473.09375, |
|
"epoch": 10.0, |
|
"grad_norm": 0.5131120681762695, |
|
"kl": 0.0043090580438729376, |
|
"learning_rate": 1.859472230593569e-07, |
|
"loss": 0.0, |
|
"reward": 0.8962987270206213, |
|
"reward_std": 1.2116570584475994, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6050799824297428, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.02559376321732998, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 493.90625, |
|
"epoch": 10.166666666666666, |
|
"grad_norm": 0.563905656337738, |
|
"kl": 0.0038846915995236486, |
|
"learning_rate": 1.7815556150379296e-07, |
|
"loss": 0.0, |
|
"reward": 1.1194737702608109, |
|
"reward_std": 1.1054411605000496, |
|
"rewards/concensus_correctness_reward_func": 0.010999999940395355, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5926925111562014, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.07828125357627869, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 433.03125, |
|
"epoch": 10.333333333333334, |
|
"grad_norm": 0.8397959470748901, |
|
"kl": 0.0044599128887057304, |
|
"learning_rate": 1.704392548431391e-07, |
|
"loss": 0.0, |
|
"reward": 1.2151572033762932, |
|
"reward_std": 1.2529273331165314, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6728134155273438, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.1894687432795763, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 568.875, |
|
"epoch": 10.5, |
|
"grad_norm": 0.41829314827919006, |
|
"kl": 0.004545757925370708, |
|
"learning_rate": 1.6280639641752942e-07, |
|
"loss": 0.0, |
|
"reward": 1.2793281599879265, |
|
"reward_std": 1.0452781841158867, |
|
"rewards/concensus_correctness_reward_func": 0.014875000342726707, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.42804690077900887, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.46140626445412636, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 476.28125, |
|
"epoch": 10.666666666666666, |
|
"grad_norm": 0.4802101254463196, |
|
"kl": 0.004307686380343512, |
|
"learning_rate": 1.552649920414233e-07, |
|
"loss": 0.0, |
|
"reward": 0.6176988333463669, |
|
"reward_std": 0.7555880509316921, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.33288631308823824, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.14418751001358032, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 398.25, |
|
"epoch": 10.833333333333334, |
|
"grad_norm": 0.7297332882881165, |
|
"kl": 0.005738754698541015, |
|
"learning_rate": 1.47822951606611e-07, |
|
"loss": 0.0, |
|
"reward": 1.0000969618558884, |
|
"reward_std": 0.8809923827648163, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5464094616472721, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.1255625020712614, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 441.96875, |
|
"epoch": 11.0, |
|
"grad_norm": 0.4999024271965027, |
|
"kl": 0.004563337104627863, |
|
"learning_rate": 1.4048808078582942e-07, |
|
"loss": 0.0, |
|
"reward": 1.2324171178042889, |
|
"reward_std": 1.470774844288826, |
|
"rewards/concensus_correctness_reward_func": 0.025624999776482582, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6081358455121517, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.0830312529578805, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 496.75, |
|
"epoch": 11.166666666666666, |
|
"grad_norm": 0.6821574568748474, |
|
"kl": 0.00595690673799254, |
|
"learning_rate": 1.3326807284568984e-07, |
|
"loss": 0.0, |
|
"reward": 1.0568560734391212, |
|
"reward_std": 0.9344909712672234, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6236373111605644, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.16759375110268593, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 458.25, |
|
"epoch": 11.333333333333334, |
|
"grad_norm": 0.37327247858047485, |
|
"kl": 0.004472251705010422, |
|
"learning_rate": 1.261705005775032e-07, |
|
"loss": 0.0, |
|
"reward": 0.8789136372506618, |
|
"reward_std": 1.044730570167303, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.4194761496037245, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.05318749602884054, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 449.96875, |
|
"epoch": 11.5, |
|
"grad_norm": 0.6042804718017578, |
|
"kl": 0.004963608211255632, |
|
"learning_rate": 1.1920280835446748e-07, |
|
"loss": 0.0, |
|
"reward": 0.7906294241547585, |
|
"reward_std": 0.8892590645700693, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4917856939136982, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.04884374141693115, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 428.78125, |
|
"epoch": 11.666666666666666, |
|
"grad_norm": 0.43670710921287537, |
|
"kl": 0.004157427014433779, |
|
"learning_rate": 1.123723043235491e-07, |
|
"loss": 0.0, |
|
"reward": 0.9911715611815453, |
|
"reward_std": 1.0859125535935163, |
|
"rewards/concensus_correctness_reward_func": 0.051249999552965164, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6306090354919434, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.0656874980777502, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 501.96875, |
|
"epoch": 11.833333333333334, |
|
"grad_norm": 0.36982962489128113, |
|
"kl": 0.004740105330711231, |
|
"learning_rate": 1.056861527402452e-07, |
|
"loss": 0.0, |
|
"reward": 1.4135117419064045, |
|
"reward_std": 2.245922774076462, |
|
"rewards/concensus_correctness_reward_func": 0.625, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.47179301269352436, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.051093748304992914, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 451.75, |
|
"epoch": 12.0, |
|
"grad_norm": 0.3974417448043823, |
|
"kl": 0.0040429688087897375, |
|
"learning_rate": 9.915136645426883e-08, |
|
"loss": 0.0, |
|
"reward": 0.7763103228062391, |
|
"reward_std": 1.1694690249860287, |
|
"rewards/concensus_correctness_reward_func": 0.05543750151991844, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.44059158489108086, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": -0.03221874684095383, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 451.84375, |
|
"epoch": 12.166666666666666, |
|
"grad_norm": 0.4643774628639221, |
|
"kl": 0.005262433871394023, |
|
"learning_rate": 9.277479955403886e-08, |
|
"loss": 0.0, |
|
"reward": 0.8141104970127344, |
|
"reward_std": 1.1598312184214592, |
|
"rewards/concensus_correctness_reward_func": 0.06631249934434891, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.3407667353749275, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.03203125298023224, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 474.53125, |
|
"epoch": 12.333333333333334, |
|
"grad_norm": 0.373272567987442, |
|
"kl": 0.004522563278442249, |
|
"learning_rate": 8.656314017768693e-08, |
|
"loss": 0.0, |
|
"reward": 1.0826607719063759, |
|
"reward_std": 0.8991422578692436, |
|
"rewards/concensus_correctness_reward_func": 0.010999999940395355, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6882857866585255, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.11775000300258398, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 520.5, |
|
"epoch": 12.5, |
|
"grad_norm": 0.5331616997718811, |
|
"kl": 0.004154504800681025, |
|
"learning_rate": 8.052290349812419e-08, |
|
"loss": 0.0, |
|
"reward": 0.7753689587116241, |
|
"reward_std": 0.855584591627121, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4509002063423395, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.011968761682510376, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 427.5, |
|
"epoch": 12.666666666666666, |
|
"grad_norm": 0.4988049566745758, |
|
"kl": 0.00503876109723933, |
|
"learning_rate": 7.46604248895252e-08, |
|
"loss": 0.0, |
|
"reward": 2.609547968953848, |
|
"reward_std": 2.4432314597070217, |
|
"rewards/concensus_correctness_reward_func": 1.25, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6731417663395405, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.17078125616535544, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 407.5, |
|
"epoch": 12.833333333333334, |
|
"grad_norm": 0.4662899971008301, |
|
"kl": 0.004989988505258225, |
|
"learning_rate": 6.898185328239467e-08, |
|
"loss": 0.0, |
|
"reward": 1.0029053278267384, |
|
"reward_std": 0.9076509363949299, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.46990528982132673, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.08637500088661909, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 611.75, |
|
"epoch": 13.0, |
|
"grad_norm": 0.40727847814559937, |
|
"kl": 0.0040196322806878015, |
|
"learning_rate": 6.349314471418849e-08, |
|
"loss": 0.0, |
|
"reward": 0.8748770579695702, |
|
"reward_std": 1.1846881732344627, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5004395483992994, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.09318749979138374, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 435.15625, |
|
"epoch": 13.166666666666666, |
|
"grad_norm": 0.778713583946228, |
|
"kl": 0.005932875850703567, |
|
"learning_rate": 5.8200056082253453e-08, |
|
"loss": 0.0, |
|
"reward": 1.1377633288502693, |
|
"reward_std": 1.0723667368292809, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.518325824290514, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.2131875054910779, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 405.28125, |
|
"epoch": 13.333333333333334, |
|
"grad_norm": 0.6267311573028564, |
|
"kl": 0.00516361856716685, |
|
"learning_rate": 5.310813910563644e-08, |
|
"loss": 0.0, |
|
"reward": 1.4793311096727848, |
|
"reward_std": 1.0320269502699375, |
|
"rewards/concensus_correctness_reward_func": 0.04312499985098839, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5567373372614384, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.4107187641784549, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 417.375, |
|
"epoch": 13.5, |
|
"grad_norm": 0.7477406859397888, |
|
"kl": 0.007013184251263738, |
|
"learning_rate": 4.8222734502097655e-08, |
|
"loss": 0.0, |
|
"reward": 1.0498220138251781, |
|
"reward_std": 1.0254092812538147, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.3807595409452915, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.20031250617466867, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 524.9375, |
|
"epoch": 13.666666666666666, |
|
"grad_norm": 0.3479692041873932, |
|
"kl": 0.003795241893385537, |
|
"learning_rate": 4.35489663864359e-08, |
|
"loss": 0.0, |
|
"reward": 0.8717399332672358, |
|
"reward_std": 0.986246095970273, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.4375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5171773973852396, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": -0.09856249298900366, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 417.5, |
|
"epoch": 13.833333333333334, |
|
"grad_norm": 0.4246520698070526, |
|
"kl": 0.00443370349239558, |
|
"learning_rate": 3.90917368959989e-08, |
|
"loss": 0.0, |
|
"reward": 2.2922814451158047, |
|
"reward_std": 2.770563669502735, |
|
"rewards/concensus_correctness_reward_func": 1.2991249989718199, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5883127357810736, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": -0.048281239345669746, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 454.21875, |
|
"epoch": 14.0, |
|
"grad_norm": 0.6315698027610779, |
|
"kl": 0.006786162994103506, |
|
"learning_rate": 3.485572104901868e-08, |
|
"loss": 0.0, |
|
"reward": 0.9570262767374516, |
|
"reward_std": 0.9035042636096478, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.0625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5563387721776962, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.30693749710917473, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 510.4375, |
|
"epoch": 14.166666666666666, |
|
"grad_norm": 0.34782856702804565, |
|
"kl": 0.0037368796474765986, |
|
"learning_rate": 3.08453618411631e-08, |
|
"loss": 0.0, |
|
"reward": 0.8143082866445184, |
|
"reward_std": 1.0274437740445137, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.44208951387554407, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.10659373924136162, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 510.75, |
|
"epoch": 14.333333333333334, |
|
"grad_norm": 0.34581589698791504, |
|
"kl": 0.0037871022359468043, |
|
"learning_rate": 2.7064865585446433e-08, |
|
"loss": 0.0, |
|
"reward": 0.8895941041409969, |
|
"reward_std": 1.2179592177271843, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5555003546178341, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.005968746729195118, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 425.09375, |
|
"epoch": 14.5, |
|
"grad_norm": 1.2589985132217407, |
|
"kl": 0.00627510278718546, |
|
"learning_rate": 2.3518197500388276e-08, |
|
"loss": 0.0, |
|
"reward": 1.2998529449105263, |
|
"reward_std": 0.9905166923999786, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5617592297494411, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.046875, |
|
"rewards/xmlcount_reward_func": 0.3787187491543591, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 414.71875, |
|
"epoch": 14.666666666666666, |
|
"grad_norm": 0.5357417464256287, |
|
"kl": 0.005580049444688484, |
|
"learning_rate": 2.0209077551046976e-08, |
|
"loss": 0.0, |
|
"reward": 1.0539772361516953, |
|
"reward_std": 1.2460423409938812, |
|
"rewards/concensus_correctness_reward_func": 0.02199999988079071, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.43419601768255234, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.20715624745935202, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 429.375, |
|
"epoch": 14.833333333333334, |
|
"grad_norm": 1.0408557653427124, |
|
"kl": 0.005757150123827159, |
|
"learning_rate": 1.7140976547289438e-08, |
|
"loss": 0.0, |
|
"reward": 0.9175567002967, |
|
"reward_std": 0.979901023209095, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4002129649743438, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.12671875953674316, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 420.3125, |
|
"epoch": 15.0, |
|
"grad_norm": 7.589697360992432, |
|
"kl": 0.007644714612979442, |
|
"learning_rate": 1.4317112503391432e-08, |
|
"loss": 0.0, |
|
"reward": 2.202411949634552, |
|
"reward_std": 2.396846577525139, |
|
"rewards/concensus_correctness_reward_func": 0.6762499995529652, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5486306510865688, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.38378125987946987, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 358.25, |
|
"epoch": 15.166666666666666, |
|
"grad_norm": 0.44812634587287903, |
|
"kl": 0.006337179249385372, |
|
"learning_rate": 1.174044726278478e-08, |
|
"loss": 0.0, |
|
"reward": 1.5023312643170357, |
|
"reward_std": 0.868278194218874, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5742063000798225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.365625005797483, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 490.96875, |
|
"epoch": 15.333333333333334, |
|
"grad_norm": 0.4610471725463867, |
|
"kl": 0.004114439827390015, |
|
"learning_rate": 9.413683391492455e-09, |
|
"loss": 0.0, |
|
"reward": 0.9374719671905041, |
|
"reward_std": 1.0686773546040058, |
|
"rewards/concensus_correctness_reward_func": 0.04312499985098839, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6183782331645489, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.07284376211464405, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 494.375, |
|
"epoch": 15.5, |
|
"grad_norm": 0.4306413233280182, |
|
"kl": 0.005119932699017227, |
|
"learning_rate": 7.339261343510206e-09, |
|
"loss": 0.0, |
|
"reward": 2.5885951071977615, |
|
"reward_std": 2.4889354780316353, |
|
"rewards/concensus_correctness_reward_func": 1.2648750003427267, |
|
"rewards/consensus_reward_func": 0.5, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.5245639234781265, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.15853123925626278, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 517.625, |
|
"epoch": 15.666666666666666, |
|
"grad_norm": 0.5044927597045898, |
|
"kl": 0.003461526444880292, |
|
"learning_rate": 5.519356901107358e-09, |
|
"loss": 0.0, |
|
"reward": 0.6680111261084676, |
|
"reward_std": 1.0589893124997616, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5050736283883452, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": -0.15868749096989632, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 502.28125, |
|
"epoch": 15.833333333333334, |
|
"grad_norm": 0.4112931489944458, |
|
"kl": 0.004242730807163753, |
|
"learning_rate": 3.95587889273144e-09, |
|
"loss": 0.0, |
|
"reward": 1.0356206335127354, |
|
"reward_std": 1.2721525505185127, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.25, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.5705581326037645, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0625, |
|
"rewards/xmlcount_reward_func": 0.15256250742822886, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 447.75, |
|
"epoch": 16.0, |
|
"grad_norm": 0.5173096060752869, |
|
"kl": 0.0055172459105961025, |
|
"learning_rate": 2.6504671909109988e-09, |
|
"loss": 0.0, |
|
"reward": 1.2947994247078896, |
|
"reward_std": 0.9049638472497463, |
|
"rewards/concensus_correctness_reward_func": 0.010999999940395355, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.41786191053688526, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.015625, |
|
"rewards/xmlcount_reward_func": 0.22531251050531864, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 486.375, |
|
"epoch": 16.166666666666668, |
|
"grad_norm": 0.47947174310684204, |
|
"kl": 0.004277107727830298, |
|
"learning_rate": 1.6044909922555972e-09, |
|
"loss": 0.0, |
|
"reward": 0.6717201061546803, |
|
"reward_std": 0.9776397682726383, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.4134075944311917, |
|
"rewards/soft_format_reward_func": 0.015625, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.05518750101327896, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 426.0, |
|
"epoch": 16.333333333333332, |
|
"grad_norm": 0.6205836534500122, |
|
"kl": 0.0041899179050233215, |
|
"learning_rate": 8.19047381357657e-10, |
|
"loss": 0.0, |
|
"reward": 1.1252982020378113, |
|
"reward_std": 0.8293185122311115, |
|
"rewards/concensus_correctness_reward_func": 0.0, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6142044588923454, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.03125, |
|
"rewards/xmlcount_reward_func": 0.10484375571832061, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 475.28125, |
|
"epoch": 16.5, |
|
"grad_norm": 0.4393874406814575, |
|
"kl": 0.004755317640956491, |
|
"learning_rate": 2.949601801023327e-10, |
|
"loss": 0.0, |
|
"reward": 2.9790390357375145, |
|
"reward_std": 2.1492477003484964, |
|
"rewards/concensus_correctness_reward_func": 1.8859999999403954, |
|
"rewards/consensus_reward_func": 0.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6341639496386051, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.021374999545514584, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 396.65625, |
|
"epoch": 16.666666666666668, |
|
"grad_norm": 0.9207046031951904, |
|
"kl": 0.005724833768908866, |
|
"learning_rate": 3.2779083591949474e-11, |
|
"loss": 0.0, |
|
"reward": 1.5880182441323996, |
|
"reward_std": 1.0737088397145271, |
|
"rewards/concensus_correctness_reward_func": 0.0403750017285347, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5196432434022427, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.0, |
|
"rewards/xmlcount_reward_func": 0.34050000831484795, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 16.666666666666668, |
|
"step": 200, |
|
"total_flos": 0.0, |
|
"train_loss": 4.575966534048348e-06, |
|
"train_runtime": 9569.7045, |
|
"train_samples_per_second": 0.334, |
|
"train_steps_per_second": 0.021 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 17, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|