|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.714285714285714, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 341.40625, |
|
"epoch": 0.05714285714285714, |
|
"grad_norm": 596.5550537109375, |
|
"kl": 0.0, |
|
"learning_rate": 1.6666666666666665e-07, |
|
"loss": 0.0, |
|
"reward": 3.286102021113038, |
|
"reward_std": 1.2568062348291278, |
|
"rewards/concensus_correctness_reward_func": 0.9363125078380108, |
|
"rewards/consensus_reward_func": 0.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.6091644916159566, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6468749991327059, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 267.03125, |
|
"epoch": 0.11428571428571428, |
|
"grad_norm": 8.391526222229004, |
|
"kl": 0.03340096258034464, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0, |
|
"reward": 6.290437173098326, |
|
"reward_std": 0.8421196703563254, |
|
"rewards/concensus_correctness_reward_func": 1.8721874952316284, |
|
"rewards/consensus_reward_func": 1.6875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.8401246860812535, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.421875, |
|
"rewards/xmlcount_reward_func": 1.09375, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 210.40625, |
|
"epoch": 0.17142857142857143, |
|
"grad_norm": 15.22243881225586, |
|
"kl": 2027.1362594434759, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 2.0271, |
|
"reward": 6.841284893453121, |
|
"reward_std": 0.4849709497721051, |
|
"rewards/concensus_correctness_reward_func": 2.0985624939203262, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9067848596605472, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.1484375, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 201.5, |
|
"epoch": 0.22857142857142856, |
|
"grad_norm": 5023845.0, |
|
"kl": 491715.70446118125, |
|
"learning_rate": 9.99934441832816e-07, |
|
"loss": 491.7157, |
|
"reward": 6.59922556579113, |
|
"reward_std": 1.0329566281288862, |
|
"rewards/concensus_correctness_reward_func": 1.9156874902546406, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9257256090641022, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.1796875, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 241.46875, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 75.62132263183594, |
|
"kl": 0.7261713498155586, |
|
"learning_rate": 9.994100796397953e-07, |
|
"loss": 0.0007, |
|
"reward": 6.299689278006554, |
|
"reward_std": 1.1271193381398916, |
|
"rewards/concensus_correctness_reward_func": 1.8690624944865704, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8681267369538546, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.125, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 135.90625, |
|
"epoch": 0.34285714285714286, |
|
"grad_norm": 13.225509643554688, |
|
"kl": 6.158389857970178, |
|
"learning_rate": 9.983619052372847e-07, |
|
"loss": 0.0062, |
|
"reward": 7.896404385566711, |
|
"reward_std": 0.2962362109683454, |
|
"rewards/concensus_correctness_reward_func": 2.4846875071525574, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.75, |
|
"rewards/question_recreation_reward_func": 0.9976543560624123, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.2421875, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 185.53125, |
|
"epoch": 0.4, |
|
"grad_norm": 1333.3822021484375, |
|
"kl": 140.61930383229628, |
|
"learning_rate": 9.967910180154888e-07, |
|
"loss": 0.1406, |
|
"reward": 6.950483754277229, |
|
"reward_std": 0.5596978962421417, |
|
"rewards/concensus_correctness_reward_func": 2.1063749976456165, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9378588311374187, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.1875, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 160.5, |
|
"epoch": 0.45714285714285713, |
|
"grad_norm": 1.1600862741470337, |
|
"kl": 16.362293783109635, |
|
"learning_rate": 9.946990656181779e-07, |
|
"loss": 0.0164, |
|
"reward": 6.930127799510956, |
|
"reward_std": 0.3672862723469734, |
|
"rewards/concensus_correctness_reward_func": 2.041000008583069, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.936002803966403, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 212.8125, |
|
"epoch": 0.5142857142857142, |
|
"grad_norm": 19658.451171875, |
|
"kl": 2282.9115716170054, |
|
"learning_rate": 9.92088242214537e-07, |
|
"loss": 2.2829, |
|
"reward": 6.798925548791885, |
|
"reward_std": 1.1088980715867365, |
|
"rewards/concensus_correctness_reward_func": 2.1001250073313713, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.8745817970484495, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.13671875, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 159.53125, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 31.297624588012695, |
|
"kl": 0.9011318488046527, |
|
"learning_rate": 9.889612861977853e-07, |
|
"loss": 0.0009, |
|
"reward": 6.927322618663311, |
|
"reward_std": 0.028847315654275008, |
|
"rewards/concensus_correctness_reward_func": 2.102874994277954, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9377288408577442, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.18359375, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 153.4375, |
|
"epoch": 0.6285714285714286, |
|
"grad_norm": 60.826229095458984, |
|
"kl": 1.329597746487707, |
|
"learning_rate": 9.853214773129795e-07, |
|
"loss": 0.0013, |
|
"reward": 6.199776213616133, |
|
"reward_std": 0.005127147152961697, |
|
"rewards/concensus_correctness_reward_func": 1.7282500192523003, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.9402761983219534, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.1875, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 179.5, |
|
"epoch": 0.6857142857142857, |
|
"grad_norm": 775657472.0, |
|
"kl": 50355153.60608631, |
|
"learning_rate": 9.81172633217015e-07, |
|
"loss": 50355.1562, |
|
"reward": 6.3887627720832825, |
|
"reward_std": 1.3089241795241833, |
|
"rewards/concensus_correctness_reward_func": 1.963062521070242, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.8848253078758717, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 0.7908749878406525, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 151.46875, |
|
"epoch": 0.7428571428571429, |
|
"grad_norm": 14.245716094970703, |
|
"kl": 2.350977373425849, |
|
"learning_rate": 9.765191054744304e-07, |
|
"loss": 0.0024, |
|
"reward": 5.939617916941643, |
|
"reward_std": 0.934625256806612, |
|
"rewards/concensus_correctness_reward_func": 1.749625001102686, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.8888366278260946, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.40625, |
|
"rewards/xmlcount_reward_func": 1.082406248897314, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 128.34375, |
|
"epoch": 0.8, |
|
"grad_norm": 48335.97265625, |
|
"kl": 1940.711479806574, |
|
"learning_rate": 9.713657749932171e-07, |
|
"loss": 1.9407, |
|
"reward": 6.796030431985855, |
|
"reward_std": 0.0927647277712822, |
|
"rewards/concensus_correctness_reward_func": 1.9866250082850456, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9695616886019707, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.23046875, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 122.46875, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.2548629343509674, |
|
"kl": 22700.092740163207, |
|
"learning_rate": 9.657180469054212e-07, |
|
"loss": 22.7001, |
|
"reward": 7.125140815973282, |
|
"reward_std": 0.32153427973389626, |
|
"rewards/concensus_correctness_reward_func": 2.1701249927282333, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9667346738278866, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.20703125, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 156.9375, |
|
"epoch": 0.9142857142857143, |
|
"grad_norm": 1398.3612060546875, |
|
"kl": 123.12271721323486, |
|
"learning_rate": 9.59581844897906e-07, |
|
"loss": 0.1231, |
|
"reward": 7.055217877030373, |
|
"reward_std": 0.24068230390548706, |
|
"rewards/concensus_correctness_reward_func": 2.1315624937415123, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9392804062226787, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 240.0, |
|
"epoch": 0.9714285714285714, |
|
"grad_norm": 186.86839294433594, |
|
"kl": 152.3791024107486, |
|
"learning_rate": 9.529636049992233e-07, |
|
"loss": 0.1524, |
|
"reward": 6.90338921546936, |
|
"reward_std": 1.3376336731016636, |
|
"rewards/concensus_correctness_reward_func": 2.182187505066395, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.8657329957932234, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.421875, |
|
"rewards/xmlcount_reward_func": 1.12109375, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 247.90625, |
|
"epoch": 1.0285714285714285, |
|
"grad_norm": 326.3687438964844, |
|
"kl": 219.12903738673776, |
|
"learning_rate": 9.458702688291071e-07, |
|
"loss": 0.2191, |
|
"reward": 5.31565772742033, |
|
"reward_std": 0.9229803088819608, |
|
"rewards/concensus_correctness_reward_func": 1.3946250043809414, |
|
"rewards/consensus_reward_func": 1.5625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.8155639320611954, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.359375, |
|
"rewards/xmlcount_reward_func": 1.05859375, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 211.09375, |
|
"epoch": 1.0857142857142856, |
|
"grad_norm": 23.403614044189453, |
|
"kl": 3.9675100842723623, |
|
"learning_rate": 9.383092763176738e-07, |
|
"loss": 0.004, |
|
"reward": 6.58778091520071, |
|
"reward_std": 1.721142528578639, |
|
"rewards/concensus_correctness_reward_func": 2.0522499941289425, |
|
"rewards/consensus_reward_func": 1.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.8519372157752514, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.40625, |
|
"rewards/xmlcount_reward_func": 1.08984375, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 218.0625, |
|
"epoch": 1.1428571428571428, |
|
"grad_norm": 327.6996154785156, |
|
"kl": 0.364447561558336, |
|
"learning_rate": 9.302885579019626e-07, |
|
"loss": 0.0004, |
|
"reward": 6.55630399286747, |
|
"reward_std": 1.1096585169434547, |
|
"rewards/concensus_correctness_reward_func": 1.981812495738268, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.8752728328108788, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.13671875, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 522.21875, |
|
"epoch": 1.2, |
|
"grad_norm": 47.249237060546875, |
|
"kl": 211.27693609474227, |
|
"learning_rate": 9.218165262080022e-07, |
|
"loss": 0.2113, |
|
"reward": 3.5790372733026743, |
|
"reward_std": 1.652490053035656, |
|
"rewards/concensus_correctness_reward_func": 0.9664374999701977, |
|
"rewards/consensus_reward_func": 0.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.5032248190109385, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.234375, |
|
"rewards/xmlcount_reward_func": 0.75, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 399.5, |
|
"epoch": 1.2571428571428571, |
|
"grad_norm": 46957.8125, |
|
"kl": 1240.154023682233, |
|
"learning_rate": 9.129020672271281e-07, |
|
"loss": 1.2402, |
|
"reward": 4.165122143924236, |
|
"reward_std": 2.677976368338932, |
|
"rewards/concensus_correctness_reward_func": 1.1625000014901161, |
|
"rewards/consensus_reward_func": 1.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.6002783491458104, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.296875, |
|
"rewards/xmlcount_reward_func": 0.85546875, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 348.71875, |
|
"epoch": 1.3142857142857143, |
|
"grad_norm": 197.34835815429688, |
|
"kl": 1697.7357009318657, |
|
"learning_rate": 9.035545309958046e-07, |
|
"loss": 1.6977, |
|
"reward": 5.216266397386789, |
|
"reward_std": 2.5298230523912935, |
|
"rewards/concensus_correctness_reward_func": 1.415687508881092, |
|
"rewards/consensus_reward_func": 1.375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.7302663810260128, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.328125, |
|
"rewards/xmlcount_reward_func": 0.9921875, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 427.28125, |
|
"epoch": 1.3714285714285714, |
|
"grad_norm": 107.24671936035156, |
|
"kl": 0.5033836024813354, |
|
"learning_rate": 8.937837217887272e-07, |
|
"loss": 0.0005, |
|
"reward": 3.593818176537752, |
|
"reward_std": 1.2321268621553827, |
|
"rewards/concensus_correctness_reward_func": 0.9172500036656857, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.5789119017135818, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.234375, |
|
"rewards/xmlcount_reward_func": 0.80078125, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 406.6875, |
|
"epoch": 1.4285714285714286, |
|
"grad_norm": 2652.8984375, |
|
"kl": 21193.798785352148, |
|
"learning_rate": 8.83599887835493e-07, |
|
"loss": 21.1938, |
|
"reward": 3.9688764177262783, |
|
"reward_std": 3.281522080527793, |
|
"rewards/concensus_correctness_reward_func": 1.0411249957978725, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.6308764494024217, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 0.859375, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 456.0, |
|
"epoch": 1.4857142857142858, |
|
"grad_norm": 9026.6044921875, |
|
"kl": 1415.1057826047763, |
|
"learning_rate": 8.73013710571623e-07, |
|
"loss": 1.4151, |
|
"reward": 3.0648840237408876, |
|
"reward_std": 1.9786420244963665, |
|
"rewards/concensus_correctness_reward_func": 0.8255000002682209, |
|
"rewards/consensus_reward_func": 0.625, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.45813402088242583, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.15625, |
|
"rewards/xmlcount_reward_func": 0.6875, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 484.0, |
|
"epoch": 1.5428571428571427, |
|
"grad_norm": 139.81741333007812, |
|
"kl": 0.6134039051830769, |
|
"learning_rate": 8.620362934352108e-07, |
|
"loss": 0.0006, |
|
"reward": 4.179860107600689, |
|
"reward_std": 1.8562917799558782, |
|
"rewards/concensus_correctness_reward_func": 1.2471874989569187, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.5654851646249881, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 0.8046875, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 408.9375, |
|
"epoch": 1.6, |
|
"grad_norm": 102.32249450683594, |
|
"kl": 3008801003.4072285, |
|
"learning_rate": 8.506791502209496e-07, |
|
"loss": 3008801.25, |
|
"reward": 3.813065191730857, |
|
"reward_std": 1.33644521248425, |
|
"rewards/concensus_correctness_reward_func": 1.0011250115931034, |
|
"rewards/consensus_reward_func": 0.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.5424089302105131, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.203125, |
|
"rewards/xmlcount_reward_func": 0.81640625, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 376.25, |
|
"epoch": 1.657142857142857, |
|
"grad_norm": 24.86591911315918, |
|
"kl": 76.24003965221345, |
|
"learning_rate": 8.389541930037516e-07, |
|
"loss": 0.0762, |
|
"reward": 3.775482662022114, |
|
"reward_std": 1.995910257101059, |
|
"rewards/concensus_correctness_reward_func": 0.9166250079870224, |
|
"rewards/consensus_reward_func": 1.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.6908889040350914, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.25, |
|
"rewards/xmlcount_reward_func": 0.91796875, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 184.46875, |
|
"epoch": 1.7142857142857144, |
|
"grad_norm": 748.4888916015625, |
|
"kl": 16.552064943592995, |
|
"learning_rate": 8.268737196446263e-07, |
|
"loss": 0.0166, |
|
"reward": 5.196034669876099, |
|
"reward_std": 2.2851073294878006, |
|
"rewards/concensus_correctness_reward_func": 1.2321875053457916, |
|
"rewards/consensus_reward_func": 1.1875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.8818159140646458, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.296875, |
|
"rewards/xmlcount_reward_func": 1.09765625, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 354.375, |
|
"epoch": 1.7714285714285714, |
|
"grad_norm": 11774.6318359375, |
|
"kl": 594.4727419780102, |
|
"learning_rate": 8.144504008919222e-07, |
|
"loss": 0.5945, |
|
"reward": 5.226467318832874, |
|
"reward_std": 1.8939718978672317, |
|
"rewards/concensus_correctness_reward_func": 1.5290624983608723, |
|
"rewards/consensus_reward_func": 1.3125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.7208423566626152, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.328125, |
|
"rewards/xmlcount_reward_func": 0.9609375, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 239.03125, |
|
"epoch": 1.8285714285714287, |
|
"grad_norm": 63723.67578125, |
|
"kl": 2681.7918725676136, |
|
"learning_rate": 8.016972670914623e-07, |
|
"loss": 2.6818, |
|
"reward": 5.760847687721252, |
|
"reward_std": 1.1207159195910208, |
|
"rewards/concensus_correctness_reward_func": 1.6048124991357327, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.8435351252555847, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.125, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 166.84375, |
|
"epoch": 1.8857142857142857, |
|
"grad_norm": 0.016271423548460007, |
|
"kl": 0.16761540318839252, |
|
"learning_rate": 7.886276945195097e-07, |
|
"loss": 0.0002, |
|
"reward": 6.907515615224838, |
|
"reward_std": 0.27857801198842935, |
|
"rewards/concensus_correctness_reward_func": 2.0479375049471855, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9689531102776527, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 166.0625, |
|
"epoch": 1.9428571428571428, |
|
"grad_norm": 1.8111671209335327, |
|
"kl": 0.17472450132481754, |
|
"learning_rate": 7.752553913529018e-07, |
|
"loss": 0.0002, |
|
"reward": 7.087520241737366, |
|
"reward_std": 0.6711924958362943, |
|
"rewards/concensus_correctness_reward_func": 2.3121249973773956, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9998952522873878, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 0.7911249995231628, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 155.53125, |
|
"epoch": 2.0, |
|
"grad_norm": 0.02920331247150898, |
|
"kl": 0.18397854757495224, |
|
"learning_rate": 7.61594383291065e-07, |
|
"loss": 0.0002, |
|
"reward": 6.990590900182724, |
|
"reward_std": 0.46470576524734497, |
|
"rewards/concensus_correctness_reward_func": 2.11124999076128, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.9691846631467342, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.20703125, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 136.96875, |
|
"epoch": 2.057142857142857, |
|
"grad_norm": 0.012207652442157269, |
|
"kl": 0.12467939942143857, |
|
"learning_rate": 7.476589988449938e-07, |
|
"loss": 0.0001, |
|
"reward": 7.336249977350235, |
|
"reward_std": 0.0, |
|
"rewards/concensus_correctness_reward_func": 2.2112499997019768, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 1.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 140.25, |
|
"epoch": 2.1142857142857143, |
|
"grad_norm": 1.8709704875946045, |
|
"kl": 0.14828130067326128, |
|
"learning_rate": 7.334638543086203e-07, |
|
"loss": 0.0001, |
|
"reward": 7.094191342592239, |
|
"reward_std": 8.292392158182338e-05, |
|
"rewards/concensus_correctness_reward_func": 2.0942500084638596, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9999413713812828, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 158.71875, |
|
"epoch": 2.1714285714285713, |
|
"grad_norm": 0.01471527200192213, |
|
"kl": 0.20862194756045938, |
|
"learning_rate": 7.190238384283412e-07, |
|
"loss": 0.0002, |
|
"reward": 6.877131998538971, |
|
"reward_std": 0.7933639287948608, |
|
"rewards/concensus_correctness_reward_func": 2.067374996840954, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.9687882512807846, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 0.9815937429666519, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 188.125, |
|
"epoch": 2.2285714285714286, |
|
"grad_norm": 1.460219144821167, |
|
"kl": 4.615653241518885, |
|
"learning_rate": 7.043540967867781e-07, |
|
"loss": 0.0046, |
|
"reward": 6.435562700033188, |
|
"reward_std": 0.9147683555056574, |
|
"rewards/concensus_correctness_reward_func": 1.9740000180900097, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9308439530432224, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 0.9369687438011169, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 167.875, |
|
"epoch": 2.2857142857142856, |
|
"grad_norm": 5.986639976501465, |
|
"kl": 97.31889040162787, |
|
"learning_rate": 6.894700159171534e-07, |
|
"loss": 0.0973, |
|
"reward": 6.660285115242004, |
|
"reward_std": 0.38862577243708074, |
|
"rewards/concensus_correctness_reward_func": 1.9778125062584877, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9363788738846779, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.21484375, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 132.59375, |
|
"epoch": 2.342857142857143, |
|
"grad_norm": 140.21267700195312, |
|
"kl": 11.938221657648683, |
|
"learning_rate": 6.743872071649411e-07, |
|
"loss": 0.0119, |
|
"reward": 7.314000904560089, |
|
"reward_std": 0.03995019569993019, |
|
"rewards/concensus_correctness_reward_func": 2.2172499895095825, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9912821874022484, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 177.125, |
|
"epoch": 2.4, |
|
"grad_norm": 0.013902968727052212, |
|
"kl": 1.2978905094787478, |
|
"learning_rate": 6.59121490313722e-07, |
|
"loss": 0.0013, |
|
"reward": 6.766894176602364, |
|
"reward_std": 0.4406161531805992, |
|
"rewards/concensus_correctness_reward_func": 2.1780624948441982, |
|
"rewards/consensus_reward_func": 1.75, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.938206740480382, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 0.947500005364418, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 130.09375, |
|
"epoch": 2.4571428571428573, |
|
"grad_norm": 9.800251960754395, |
|
"kl": 0.22617360670119524, |
|
"learning_rate": 6.436888769924141e-07, |
|
"loss": 0.0002, |
|
"reward": 7.254179358482361, |
|
"reward_std": 0.013411822263151407, |
|
"rewards/concensus_correctness_reward_func": 2.1394999995827675, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9896793477237225, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 168.96875, |
|
"epoch": 2.5142857142857142, |
|
"grad_norm": 1.437393307685852, |
|
"kl": 0.18564400169998407, |
|
"learning_rate": 6.281055538812861e-07, |
|
"loss": 0.0002, |
|
"reward": 7.096096932888031, |
|
"reward_std": 0.3263694606721401, |
|
"rewards/concensus_correctness_reward_func": 2.0173750072717667, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9615344516932964, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.2109375, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 225.4375, |
|
"epoch": 2.571428571428571, |
|
"grad_norm": 143.41500854492188, |
|
"kl": 8.297026936896145, |
|
"learning_rate": 6.123878657343647e-07, |
|
"loss": 0.0083, |
|
"reward": 6.617781460285187, |
|
"reward_std": 1.0305635929107666, |
|
"rewards/concensus_correctness_reward_func": 1.9847500026226044, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9064690098166466, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.1484375, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 155.125, |
|
"epoch": 2.6285714285714286, |
|
"grad_norm": 29.54805564880371, |
|
"kl": 1.2011672258377075, |
|
"learning_rate": 5.96552298236044e-07, |
|
"loss": 0.0012, |
|
"reward": 7.490390375256538, |
|
"reward_std": 0.510272353887558, |
|
"rewards/concensus_correctness_reward_func": 2.4165625013411045, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5625, |
|
"rewards/question_recreation_reward_func": 0.9136716090142727, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.20703125, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 183.28125, |
|
"epoch": 2.685714285714286, |
|
"grad_norm": 260070128.0, |
|
"kl": 8012920.115434824, |
|
"learning_rate": 5.806154607098799e-07, |
|
"loss": 8012.9199, |
|
"reward": 6.427686184644699, |
|
"reward_std": 0.5588814318180084, |
|
"rewards/concensus_correctness_reward_func": 1.8334374986588955, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9379986636340618, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.1875, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 161.90625, |
|
"epoch": 2.742857142857143, |
|
"grad_norm": 165.2026824951172, |
|
"kl": 1.9255495527759194, |
|
"learning_rate": 5.645940686977032e-07, |
|
"loss": 0.0019, |
|
"reward": 6.643823355436325, |
|
"reward_std": 0.2799105942249298, |
|
"rewards/concensus_correctness_reward_func": 1.9093125015497208, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9688858352601528, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 216.78125, |
|
"epoch": 2.8, |
|
"grad_norm": 15576.0224609375, |
|
"kl": 3184225.6172290286, |
|
"learning_rate": 5.485049264273241e-07, |
|
"loss": 3184.2256, |
|
"reward": 6.616672560572624, |
|
"reward_std": 1.0126504600048065, |
|
"rewards/concensus_correctness_reward_func": 1.9756249897181988, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9066725894808769, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.15625, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 155.625, |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 43.54243850708008, |
|
"kl": 17.29876364581287, |
|
"learning_rate": 5.323649091872178e-07, |
|
"loss": 0.0173, |
|
"reward": 7.1484761238098145, |
|
"reward_std": 0.29683050513267517, |
|
"rewards/concensus_correctness_reward_func": 2.252500005066395, |
|
"rewards/consensus_reward_func": 1.8125, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9389448185684159, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.20703125, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 196.84375, |
|
"epoch": 2.914285714285714, |
|
"grad_norm": 1101.1700439453125, |
|
"kl": 313.8222270826809, |
|
"learning_rate": 5.16190945626678e-07, |
|
"loss": 0.3138, |
|
"reward": 6.691585049033165, |
|
"reward_std": 0.5840071098791668, |
|
"rewards/concensus_correctness_reward_func": 1.989124983549118, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9368351008743048, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.1875, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 153.96875, |
|
"epoch": 2.9714285714285715, |
|
"grad_norm": 1.4117786884307861, |
|
"kl": 0.2296100074891001, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0002, |
|
"reward": 7.403991624712944, |
|
"reward_std": 0.28745076060295105, |
|
"rewards/concensus_correctness_reward_func": 2.2985000126063824, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9687728695571423, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21484375, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 154.875, |
|
"epoch": 3.0285714285714285, |
|
"grad_norm": 0.05570273473858833, |
|
"kl": 295.5748745780438, |
|
"learning_rate": 4.838090543733221e-07, |
|
"loss": 0.2956, |
|
"reward": 7.4088806957006454, |
|
"reward_std": 0.2800062551832525, |
|
"rewards/concensus_correctness_reward_func": 2.299625001847744, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9686306864023209, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 164.65625, |
|
"epoch": 3.085714285714286, |
|
"grad_norm": 0.04709651321172714, |
|
"kl": 0.1969663049094379, |
|
"learning_rate": 4.676350908127821e-07, |
|
"loss": 0.0002, |
|
"reward": 7.149936303496361, |
|
"reward_std": 0.28781798481941223, |
|
"rewards/concensus_correctness_reward_func": 2.1696874983608723, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9685300551354885, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21484375, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 139.78125, |
|
"epoch": 3.142857142857143, |
|
"grad_norm": 0.015770502388477325, |
|
"kl": 0.20569787896238267, |
|
"learning_rate": 4.5149507357267597e-07, |
|
"loss": 0.0002, |
|
"reward": 7.575696915388107, |
|
"reward_std": 0.0, |
|
"rewards/concensus_correctness_reward_func": 2.3559999987483025, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.969696968793869, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 157.21875, |
|
"epoch": 3.2, |
|
"grad_norm": 50861.9453125, |
|
"kl": 6200.607699844055, |
|
"learning_rate": 4.354059313022969e-07, |
|
"loss": 6.2006, |
|
"reward": 7.015130370855331, |
|
"reward_std": 0.45689108967781067, |
|
"rewards/concensus_correctness_reward_func": 2.093562498688698, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9684428572654724, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.21875, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 164.0, |
|
"epoch": 3.257142857142857, |
|
"grad_norm": 0.013901927508413792, |
|
"kl": 6807.066189021803, |
|
"learning_rate": 4.193845392901201e-07, |
|
"loss": 6.8071, |
|
"reward": 6.620099663734436, |
|
"reward_std": 0.3450772762298584, |
|
"rewards/concensus_correctness_reward_func": 2.0902499929070473, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.937474632402882, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 0.9986249953508377, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 140.21875, |
|
"epoch": 3.314285714285714, |
|
"grad_norm": 0.050351180136203766, |
|
"kl": 0.195719227893278, |
|
"learning_rate": 4.0344770176395606e-07, |
|
"loss": 0.0002, |
|
"reward": 7.224750071763992, |
|
"reward_std": 0.0, |
|
"rewards/concensus_correctness_reward_func": 2.224750004708767, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 1.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 163.25, |
|
"epoch": 3.3714285714285714, |
|
"grad_norm": 21.72372817993164, |
|
"kl": 2.1254541873931885, |
|
"learning_rate": 3.8761213426563543e-07, |
|
"loss": 0.0021, |
|
"reward": 7.039299890398979, |
|
"reward_std": 0.43868909776210785, |
|
"rewards/concensus_correctness_reward_func": 2.16687498614192, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9388312064111233, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.21484375, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 159.15625, |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 0.3591388761997223, |
|
"kl": 6.059129260480404, |
|
"learning_rate": 3.718944461187138e-07, |
|
"loss": 0.0061, |
|
"reward": 6.463347539305687, |
|
"reward_std": 0.3400730788707733, |
|
"rewards/concensus_correctness_reward_func": 1.9808750078082085, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9365975013934076, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 0.9521249979734421, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 123.65625, |
|
"epoch": 3.4857142857142858, |
|
"grad_norm": 194.1432647705078, |
|
"kl": 57.89645641669631, |
|
"learning_rate": 3.563111230075859e-07, |
|
"loss": 0.0579, |
|
"reward": 7.029898107051849, |
|
"reward_std": 0.3636358277872205, |
|
"rewards/concensus_correctness_reward_func": 2.115187507122755, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.914116925559938, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.18809375166893, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 148.8125, |
|
"epoch": 3.5428571428571427, |
|
"grad_norm": 14.622519493103027, |
|
"kl": 0.3278482835739851, |
|
"learning_rate": 3.408785096862782e-07, |
|
"loss": 0.0003, |
|
"reward": 6.669890329241753, |
|
"reward_std": 0.951744182035327, |
|
"rewards/concensus_correctness_reward_func": 2.159937519580126, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9516402631998062, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 0.7770625054836273, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 138.34375, |
|
"epoch": 3.6, |
|
"grad_norm": 3.0443975925445557, |
|
"kl": 278.0337795561645, |
|
"learning_rate": 3.2561279283505884e-07, |
|
"loss": 0.278, |
|
"reward": 7.347518771886826, |
|
"reward_std": 0.10661890726260026, |
|
"rewards/concensus_correctness_reward_func": 2.1738749966025352, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9431750550866127, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 133.65625, |
|
"epoch": 3.657142857142857, |
|
"grad_norm": 63.743953704833984, |
|
"kl": 141.1469784581568, |
|
"learning_rate": 3.105299840828466e-07, |
|
"loss": 0.1411, |
|
"reward": 6.800521522760391, |
|
"reward_std": 0.010683320462703705, |
|
"rewards/concensus_correctness_reward_func": 1.9629999995231628, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9625215027481318, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 136.46875, |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 8.97818660736084, |
|
"kl": 0.5335320448502898, |
|
"learning_rate": 2.95645903213222e-07, |
|
"loss": 0.0005, |
|
"reward": 7.278413146734238, |
|
"reward_std": 0.07083342224359512, |
|
"rewards/concensus_correctness_reward_func": 2.2035000026226044, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9694444462656975, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 129.1875, |
|
"epoch": 3.7714285714285714, |
|
"grad_norm": 2.273700714111328, |
|
"kl": 0.1980545329861343, |
|
"learning_rate": 2.8097616157165885e-07, |
|
"loss": 0.0002, |
|
"reward": 7.1327812522649765, |
|
"reward_std": 0.10708247870206833, |
|
"rewards/concensus_correctness_reward_func": 2.1523124910891056, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 1.0, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 136.46875, |
|
"epoch": 3.8285714285714287, |
|
"grad_norm": 3.4171736240386963, |
|
"kl": 0.1759730235207826, |
|
"learning_rate": 2.665361456913797e-07, |
|
"loss": 0.0002, |
|
"reward": 7.524241715669632, |
|
"reward_std": 0.012173316441476345, |
|
"rewards/concensus_correctness_reward_func": 2.2848750203847885, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.989366702735424, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 132.6875, |
|
"epoch": 3.8857142857142857, |
|
"grad_norm": 21.402355194091797, |
|
"kl": 0.2659228784032166, |
|
"learning_rate": 2.523410011550064e-07, |
|
"loss": 0.0003, |
|
"reward": 6.992657542228699, |
|
"reward_std": 0.04131975769996643, |
|
"rewards/concensus_correctness_reward_func": 2.021875023841858, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9707825183868408, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 129.03125, |
|
"epoch": 3.942857142857143, |
|
"grad_norm": 132813709312.0, |
|
"kl": 6803573266.517654, |
|
"learning_rate": 2.3840561670893495e-07, |
|
"loss": 6803573.5, |
|
"reward": 6.834182530641556, |
|
"reward_std": 0.2304869929794222, |
|
"rewards/concensus_correctness_reward_func": 2.089499995112419, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8875887226313353, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.1695937439799309, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 140.8125, |
|
"epoch": 4.0, |
|
"grad_norm": 0.022691868245601654, |
|
"kl": 26381.484041058226, |
|
"learning_rate": 2.247446086470982e-07, |
|
"loss": 26.3815, |
|
"reward": 7.248510301113129, |
|
"reward_std": 0.0460367277264595, |
|
"rewards/concensus_correctness_reward_func": 2.21637499332428, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.969635296612978, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 157.53125, |
|
"epoch": 4.057142857142857, |
|
"grad_norm": 0.009256873279809952, |
|
"kl": 3.527690098620951, |
|
"learning_rate": 2.113723054804904e-07, |
|
"loss": 0.0035, |
|
"reward": 7.401162892580032, |
|
"reward_std": 0.6326838135719299, |
|
"rewards/concensus_correctness_reward_func": 2.4743749871850014, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.9991629458963871, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 0.8182500004768372, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 129.03125, |
|
"epoch": 4.114285714285714, |
|
"grad_norm": 7.365884304046631, |
|
"kl": 0.2767415994312614, |
|
"learning_rate": 1.9830273290853766e-07, |
|
"loss": 0.0003, |
|
"reward": 7.2931163012981415, |
|
"reward_std": 0.07107648908277042, |
|
"rewards/concensus_correctness_reward_func": 2.218375027179718, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9692725799977779, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 156.8125, |
|
"epoch": 4.171428571428572, |
|
"grad_norm": 4.549909591674805, |
|
"kl": 2098.400880107074, |
|
"learning_rate": 1.8554959910807772e-07, |
|
"loss": 2.0984, |
|
"reward": 6.6754628121852875, |
|
"reward_std": 0.5301313251256943, |
|
"rewards/concensus_correctness_reward_func": 2.098374992609024, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.9207440502941608, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.0000937432050705, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 161.1875, |
|
"epoch": 4.228571428571429, |
|
"grad_norm": 130.0556640625, |
|
"kl": 4.999799037585035, |
|
"learning_rate": 1.7312628035537386e-07, |
|
"loss": 0.005, |
|
"reward": 7.465287238359451, |
|
"reward_std": 0.6660420118496404, |
|
"rewards/concensus_correctness_reward_func": 2.3166875019669533, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.625, |
|
"rewards/question_recreation_reward_func": 0.9381621927022934, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.194812498986721, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 134.40625, |
|
"epoch": 4.285714285714286, |
|
"grad_norm": 8.668111801147461, |
|
"kl": 641.8768360905815, |
|
"learning_rate": 1.6104580699624837e-07, |
|
"loss": 0.6419, |
|
"reward": 7.1231569945812225, |
|
"reward_std": 0.0297078593284823, |
|
"rewards/concensus_correctness_reward_func": 2.342249996960163, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.8770633104722947, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.2163437493145466, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 122.125, |
|
"epoch": 4.3428571428571425, |
|
"grad_norm": 6.391092777252197, |
|
"kl": 0.20541435782797635, |
|
"learning_rate": 1.493208497790504e-07, |
|
"loss": 0.0002, |
|
"reward": 6.966011185199022, |
|
"reward_std": 0.00430500041693449, |
|
"rewards/concensus_correctness_reward_func": 2.106500007212162, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9532612152397633, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.1875, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 127.3125, |
|
"epoch": 4.4, |
|
"grad_norm": 43.81622314453125, |
|
"kl": 5.563992372946814, |
|
"learning_rate": 1.3796370656478934e-07, |
|
"loss": 0.0056, |
|
"reward": 6.806715875864029, |
|
"reward_std": 0.06279254704713821, |
|
"rewards/concensus_correctness_reward_func": 1.9822500199079514, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9689970314502716, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 130.46875, |
|
"epoch": 4.457142857142857, |
|
"grad_norm": 122.86102294921875, |
|
"kl": 20.539323112927377, |
|
"learning_rate": 1.2698628942837697e-07, |
|
"loss": 0.0205, |
|
"reward": 6.735322088003159, |
|
"reward_std": 0.23215299472212791, |
|
"rewards/concensus_correctness_reward_func": 2.086000010371208, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.1875, |
|
"rewards/question_recreation_reward_func": 0.896103395964019, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.2219687476754189, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 132.90625, |
|
"epoch": 4.514285714285714, |
|
"grad_norm": 69.87224578857422, |
|
"kl": 0.6354921485763043, |
|
"learning_rate": 1.1640011216450691e-07, |
|
"loss": 0.0006, |
|
"reward": 7.233689934015274, |
|
"reward_std": 0.514858566224575, |
|
"rewards/concensus_correctness_reward_func": 2.2895624935626984, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.8818775303661823, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.1872499994933605, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 133.59375, |
|
"epoch": 4.571428571428571, |
|
"grad_norm": 8.249082565307617, |
|
"kl": 0.26116269128397107, |
|
"learning_rate": 1.0621627821127288e-07, |
|
"loss": 0.0003, |
|
"reward": 7.30239263176918, |
|
"reward_std": 0.0416584275662899, |
|
"rewards/concensus_correctness_reward_func": 2.208875000476837, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9685175716876984, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 136.1875, |
|
"epoch": 4.628571428571428, |
|
"grad_norm": 0.024719232693314552, |
|
"kl": 0.7720870058983564, |
|
"learning_rate": 9.644546900419531e-08, |
|
"loss": 0.0008, |
|
"reward": 6.74112144112587, |
|
"reward_std": 0.14779043197631836, |
|
"rewards/concensus_correctness_reward_func": 1.97062499076128, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9422151371836662, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.2189062498509884, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 132.28125, |
|
"epoch": 4.685714285714286, |
|
"grad_norm": 99.90188598632812, |
|
"kl": 5.756765312515199, |
|
"learning_rate": 8.70979327728718e-08, |
|
"loss": 0.0058, |
|
"reward": 7.076995253562927, |
|
"reward_std": 0.3961862847208977, |
|
"rewards/concensus_correctness_reward_func": 2.1743750162422657, |
|
"rewards/consensus_reward_func": 1.9375, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9299640282988548, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.20703125, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 133.46875, |
|
"epoch": 4.742857142857143, |
|
"grad_norm": 1942.1475830078125, |
|
"kl": 80.78691061586142, |
|
"learning_rate": 7.81834737919978e-08, |
|
"loss": 0.0808, |
|
"reward": 7.304515153169632, |
|
"reward_std": 0.11349444479128579, |
|
"rewards/concensus_correctness_reward_func": 2.350000001490116, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.8892338592559099, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.2215312495827675, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 131.75, |
|
"epoch": 4.8, |
|
"grad_norm": 36.11498260498047, |
|
"kl": 35.66279458301142, |
|
"learning_rate": 6.971144209803736e-08, |
|
"loss": 0.0357, |
|
"reward": 7.376064032316208, |
|
"reward_std": 0.1673837215421372, |
|
"rewards/concensus_correctness_reward_func": 2.3438749983906746, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.8862515506334603, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.2084374986588955, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 159.9375, |
|
"epoch": 4.857142857142857, |
|
"grad_norm": 1420.3355712890625, |
|
"kl": 5809.957044942072, |
|
"learning_rate": 6.16907236823262e-08, |
|
"loss": 5.81, |
|
"reward": 6.121835008263588, |
|
"reward_std": 0.37124670308548957, |
|
"rewards/concensus_correctness_reward_func": 1.8349375016987324, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0625, |
|
"rewards/question_recreation_reward_func": 0.7878349621314555, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.40625, |
|
"rewards/xmlcount_reward_func": 1.1553125008940697, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 139.1875, |
|
"epoch": 4.914285714285715, |
|
"grad_norm": 10.688383102416992, |
|
"kl": 0.22011687979102135, |
|
"learning_rate": 5.412973117089287e-08, |
|
"loss": 0.0002, |
|
"reward": 7.198920458555222, |
|
"reward_std": 0.20292456448078156, |
|
"rewards/concensus_correctness_reward_func": 2.0933750048279762, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.9375766552984715, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 136.0, |
|
"epoch": 4.9714285714285715, |
|
"grad_norm": 11.03411865234375, |
|
"kl": 1.326516842469573, |
|
"learning_rate": 4.703639500077655e-08, |
|
"loss": 0.0013, |
|
"reward": 6.875307530164719, |
|
"reward_std": 0.13279777020215988, |
|
"rewards/concensus_correctness_reward_func": 2.067124992609024, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9099949998781085, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.1950624994933605, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 137.59375, |
|
"epoch": 5.0285714285714285, |
|
"grad_norm": 130.2829132080078, |
|
"kl": 10.879363138461486, |
|
"learning_rate": 4.041815510209395e-08, |
|
"loss": 0.0109, |
|
"reward": 6.4715642631053925, |
|
"reward_std": 0.063371941447258, |
|
"rewards/concensus_correctness_reward_func": 1.7663749903440475, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.0, |
|
"rewards/question_recreation_reward_func": 0.9747205302119255, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 133.40625, |
|
"epoch": 5.085714285714285, |
|
"grad_norm": 34.99993133544922, |
|
"kl": 2.79548569873441, |
|
"learning_rate": 3.4281953094578875e-08, |
|
"loss": 0.0028, |
|
"reward": 7.421046018600464, |
|
"reward_std": 0.10142664304294158, |
|
"rewards/concensus_correctness_reward_func": 2.3401249796152115, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9102647739928216, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.2175312489271164, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 167.1875, |
|
"epoch": 5.142857142857143, |
|
"grad_norm": 3914.157470703125, |
|
"kl": 408.69288858864456, |
|
"learning_rate": 2.8634225006782864e-08, |
|
"loss": 0.4087, |
|
"reward": 6.723982572555542, |
|
"reward_std": 0.15523457527160645, |
|
"rewards/concensus_correctness_reward_func": 1.9587500020861626, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.125, |
|
"rewards/question_recreation_reward_func": 0.9449200928211212, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.2265625, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 128.625, |
|
"epoch": 5.2, |
|
"grad_norm": 40.13917922973633, |
|
"kl": 16038.165946810506, |
|
"learning_rate": 2.348089452556956e-08, |
|
"loss": 16.0382, |
|
"reward": 6.769445240497589, |
|
"reward_std": 0.5420721787959337, |
|
"rewards/concensus_correctness_reward_func": 2.2128750011324883, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8193827569484711, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.421875, |
|
"rewards/xmlcount_reward_func": 1.190312497317791, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 135.5625, |
|
"epoch": 5.257142857142857, |
|
"grad_norm": 0.020569220185279846, |
|
"kl": 0.19448763993568718, |
|
"learning_rate": 1.882736678298491e-08, |
|
"loss": 0.0002, |
|
"reward": 7.055787056684494, |
|
"reward_std": 0.03314562886953354, |
|
"rewards/concensus_correctness_reward_func": 2.081250011920929, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.9979745373129845, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.2421875, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 156.5, |
|
"epoch": 5.314285714285714, |
|
"grad_norm": 19.919050216674805, |
|
"kl": 14.24597706948407, |
|
"learning_rate": 1.4678522687020412e-08, |
|
"loss": 0.0142, |
|
"reward": 6.925608813762665, |
|
"reward_std": 0.1697275247424841, |
|
"rewards/concensus_correctness_reward_func": 2.210874982178211, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.8500463847303763, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.1771874986588955, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 128.25, |
|
"epoch": 5.371428571428572, |
|
"grad_norm": 677.8944702148438, |
|
"kl": 176.0574713665992, |
|
"learning_rate": 1.1038713802214717e-08, |
|
"loss": 0.1761, |
|
"reward": 6.819926559925079, |
|
"reward_std": 0.1877051831688732, |
|
"rewards/concensus_correctness_reward_func": 2.0945000126957893, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.7840202623046935, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.453125, |
|
"rewards/xmlcount_reward_func": 1.23828125, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 128.46875, |
|
"epoch": 5.428571428571429, |
|
"grad_norm": 0.06580457091331482, |
|
"kl": 0.8888655919581652, |
|
"learning_rate": 7.91175778546288e-09, |
|
"loss": 0.0009, |
|
"reward": 7.360000848770142, |
|
"reward_std": 0.15114279091358185, |
|
"rewards/concensus_correctness_reward_func": 2.34187500923872, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.4375, |
|
"rewards/question_recreation_reward_func": 0.975157156586647, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.24609375, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 133.9375, |
|
"epoch": 5.485714285714286, |
|
"grad_norm": 9619.09375, |
|
"kl": 433.19152829330415, |
|
"learning_rate": 5.3009343818219975e-09, |
|
"loss": 0.4332, |
|
"reward": 7.0666501224040985, |
|
"reward_std": 0.1772715449333191, |
|
"rewards/concensus_correctness_reward_func": 2.06700000166893, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9739313460886478, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.2288437485694885, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 163.15625, |
|
"epoch": 5.542857142857143, |
|
"grad_norm": 17.19892120361328, |
|
"kl": 1.2293917203787714, |
|
"learning_rate": 3.2089819845111944e-09, |
|
"loss": 0.0012, |
|
"reward": 7.017405599355698, |
|
"reward_std": 0.27922892197966576, |
|
"rewards/concensus_correctness_reward_func": 2.2905624993145466, |
|
"rewards/consensus_reward_func": 1.875, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.3125, |
|
"rewards/question_recreation_reward_func": 0.9062805884168483, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.4375, |
|
"rewards/xmlcount_reward_func": 1.1955625005066395, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 137.96875, |
|
"epoch": 5.6, |
|
"grad_norm": 0.01611829362809658, |
|
"kl": 0.16643174993805587, |
|
"learning_rate": 1.638094762715314e-09, |
|
"loss": 0.0002, |
|
"reward": 7.399079352617264, |
|
"reward_std": 0.0, |
|
"rewards/concensus_correctness_reward_func": 2.149375006556511, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.5, |
|
"rewards/question_recreation_reward_func": 0.9997043535113335, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.5, |
|
"rewards/xmlcount_reward_func": 1.25, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 158.21875, |
|
"epoch": 5.6571428571428575, |
|
"grad_norm": 2546.03564453125, |
|
"kl": 279.96135277603753, |
|
"learning_rate": 5.899203602046654e-10, |
|
"loss": 0.28, |
|
"reward": 6.826089903712273, |
|
"reward_std": 0.3347325325012207, |
|
"rewards/concensus_correctness_reward_func": 2.163374997675419, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.25, |
|
"rewards/question_recreation_reward_func": 0.938058597035706, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.46875, |
|
"rewards/xmlcount_reward_func": 1.0059062540531158, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 155.625, |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 32.50419616699219, |
|
"kl": 3.371709798462689, |
|
"learning_rate": 6.555816718389895e-11, |
|
"loss": 0.0034, |
|
"reward": 7.258530080318451, |
|
"reward_std": 0.13536836579442024, |
|
"rewards/concensus_correctness_reward_func": 2.2292499989271164, |
|
"rewards/consensus_reward_func": 2.0, |
|
"rewards/cumulative_reward_2": 0.0, |
|
"rewards/final_correctness_reward_func": 0.375, |
|
"rewards/question_recreation_reward_func": 0.9394363649189472, |
|
"rewards/soft_format_reward_func": 0.0, |
|
"rewards/strict_format_reward_func": 0.484375, |
|
"rewards/xmlcount_reward_func": 1.23046875, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"step": 200, |
|
"total_flos": 0.0, |
|
"train_loss": 98745.44226059376, |
|
"train_runtime": 1459.4013, |
|
"train_samples_per_second": 2.193, |
|
"train_steps_per_second": 0.137 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|