{ "best_global_step": 400, "best_metric": 1.7388446187973023, "best_model_checkpoint": "/mnt/development/ubuntu/ms-swift/examples/train/grpo/output/v56-20250402-121702/checkpoint-400", "epoch": 0.16161616161616163, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 1499.96875, "epoch": 0.00040404040404040404, "grad_norm": 0.13631828129291534, "kl": 0.0, "learning_rate": 8.064516129032259e-08, "loss": 0.0811297595500946, "memory(GiB)": 196.04, "response_clip_ratio": 0.09375, "reward": 1.3449952602386475, "reward_std": 0.3455343246459961, "rewards/CosineReward": 0.6062303781509399, "rewards/Format": 0.8125, "rewards/RepetitionPenalty": -0.07373504340648651, "step": 1, "train_speed(iter/s)": 0.007784 }, { "clip_ratio": 0.0, "completion_length": 1400.6180555555557, "epoch": 0.00404040404040404, "grad_norm": 0.08316703885793686, "kl": 0.0010664198133680555, "learning_rate": 8.064516129032258e-07, "loss": 0.08019257254070705, "memory(GiB)": 200.91, "response_clip_ratio": 0.07291666666666667, "reward": 1.3266024854448106, "reward_std": 0.37116087476412457, "rewards/CosineReward": 0.6330045991473727, "rewards/Format": 0.7430555555555556, "rewards/RepetitionPenalty": -0.04945767049988111, "step": 10, "train_speed(iter/s)": 0.012326 }, { "clip_ratio": 0.0, "completion_length": 1391.28125, "epoch": 0.00808080808080808, "grad_norm": 0.07319434732198715, "kl": 0.001186370849609375, "learning_rate": 1.6129032258064516e-06, "loss": 0.10411052703857422, "memory(GiB)": 201.03, "response_clip_ratio": 0.065625, "reward": 1.3246135115623474, "reward_std": 0.3957327276468277, "rewards/CosineReward": 0.6364030420780182, "rewards/Format": 0.740625, "rewards/RepetitionPenalty": -0.05241450294852257, "step": 20, "train_speed(iter/s)": 0.012834 }, { "clip_ratio": 0.0, "completion_length": 1413.8125, "epoch": 0.012121212121212121, "grad_norm": 0.07632259279489517, "kl": 0.00119476318359375, "learning_rate": 2.4193548387096776e-06, "loss": 0.1320902466773987, "memory(GiB)": 201.03, "response_clip_ratio": 0.078125, "reward": 1.3080031633377076, "reward_std": 0.4167481750249863, "rewards/CosineReward": 0.6317151665687561, "rewards/Format": 0.725, "rewards/RepetitionPenalty": -0.04871200248599052, "step": 30, "train_speed(iter/s)": 0.012994 }, { "clip_ratio": 0.0, "completion_length": 1422.8625, "epoch": 0.01616161616161616, "grad_norm": 0.06581436842679977, "kl": 0.00121917724609375, "learning_rate": 3.225806451612903e-06, "loss": 0.10100094079971314, "memory(GiB)": 201.32, "response_clip_ratio": 0.05625, "reward": 1.341243851184845, "reward_std": 0.37055844664573667, "rewards/CosineReward": 0.6274715602397919, "rewards/Format": 0.76875, "rewards/RepetitionPenalty": -0.05497771985828877, "step": 40, "train_speed(iter/s)": 0.013074 }, { "epoch": 0.020202020202020204, "grad_norm": 0.07947830855846405, "learning_rate": 4.032258064516129e-06, "loss": 0.09274066686630249, "memory(GiB)": 201.32, "step": 50, "train_speed(iter/s)": 0.013131 }, { "epoch": 0.020202020202020204, "eval_clip_ratio": 0.0, "eval_completion_length": 1431.635, "eval_kl": 0.0014617919921875, "eval_loss": 0.0851428210735321, "eval_response_clip_ratio": 0.075, "eval_reward": 1.3496401262283326, "eval_reward_std": 0.37406040787696837, "eval_rewards/CosineReward": 0.6217813873291016, "eval_rewards/Format": 0.78125, "eval_rewards/RepetitionPenalty": -0.05339125469326973, "eval_runtime": 1594.5988, "eval_samples_per_second": 0.063, "eval_steps_per_second": 0.003, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 1380.6359375, "epoch": 0.024242424242424242, "grad_norm": 0.05946151912212372, "kl": 0.001180267333984375, "learning_rate": 4.838709677419355e-06, "loss": 0.1046899676322937, "memory(GiB)": 201.32, "response_clip_ratio": 0.0671875, "reward": 1.3121023535728455, "reward_std": 0.4145697370171547, "rewards/CosineReward": 0.6392273038625718, "rewards/Format": 0.725, "rewards/RepetitionPenalty": -0.05212494917213917, "step": 60, "train_speed(iter/s)": 0.009784 }, { "clip_ratio": 0.0, "completion_length": 1376.196875, "epoch": 0.028282828282828285, "grad_norm": 0.06961650401353836, "kl": 0.00160675048828125, "learning_rate": 5.645161290322582e-06, "loss": 0.10449786186218261, "memory(GiB)": 201.32, "response_clip_ratio": 0.065625, "reward": 1.339801287651062, "reward_std": 0.3858643054962158, "rewards/CosineReward": 0.6405475556850433, "rewards/Format": 0.753125, "rewards/RepetitionPenalty": -0.053871278464794156, "step": 70, "train_speed(iter/s)": 0.010181 }, { "clip_ratio": 0.0, "completion_length": 1407.60625, "epoch": 0.03232323232323232, "grad_norm": 0.07890625298023224, "kl": 0.003438568115234375, "learning_rate": 6.451612903225806e-06, "loss": 0.11079697608947754, "memory(GiB)": 201.32, "response_clip_ratio": 0.078125, "reward": 1.397366178035736, "reward_std": 0.3462020069360733, "rewards/CosineReward": 0.6307743966579438, "rewards/Format": 0.828125, "rewards/RepetitionPenalty": -0.06153322011232376, "step": 80, "train_speed(iter/s)": 0.010502 }, { "clip_ratio": 0.0, "completion_length": 1462.675, "epoch": 0.03636363636363636, "grad_norm": 0.06447512656450272, "kl": 0.003720855712890625, "learning_rate": 7.258064516129033e-06, "loss": 0.113616943359375, "memory(GiB)": 201.32, "response_clip_ratio": 0.08125, "reward": 1.3906286120414735, "reward_std": 0.34696930199861525, "rewards/CosineReward": 0.6157887756824494, "rewards/Format": 0.828125, "rewards/RepetitionPenalty": -0.053285162523388864, "step": 90, "train_speed(iter/s)": 0.010757 }, { "epoch": 0.04040404040404041, "grad_norm": 0.06723054498434067, "learning_rate": 8.064516129032258e-06, "loss": 0.10635331869125367, "memory(GiB)": 201.32, "step": 100, "train_speed(iter/s)": 0.010993 }, { "epoch": 0.04040404040404041, "eval_clip_ratio": 0.0, "eval_completion_length": 1431.08625, "eval_kl": 0.00452056884765625, "eval_loss": 0.11490967124700546, "eval_response_clip_ratio": 0.05, "eval_reward": 1.4277572107315064, "eval_reward_std": 0.3158085200190544, "eval_rewards/CosineReward": 0.6205263376235962, "eval_rewards/Format": 0.86125, "eval_rewards/RepetitionPenalty": -0.05401912465691566, "eval_runtime": 1599.9063, "eval_samples_per_second": 0.063, "eval_steps_per_second": 0.003, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 1381.040625, "epoch": 0.044444444444444446, "grad_norm": 0.06708762049674988, "kl": 1.6170688629150392, "learning_rate": 8.870967741935484e-06, "loss": 0.1910473108291626, "memory(GiB)": 201.32, "response_clip_ratio": 0.0390625, "reward": 1.4152386724948882, "reward_std": 0.30315608307719233, "rewards/CosineReward": 0.6367029249668121, "rewards/Format": 0.8359375, "rewards/RepetitionPenalty": -0.05740173757076263, "step": 110, "train_speed(iter/s)": 0.009616 }, { "clip_ratio": 0.0, "completion_length": 1406.2625, "epoch": 0.048484848484848485, "grad_norm": 0.07686583697795868, "kl": 0.00360107421875, "learning_rate": 9.67741935483871e-06, "loss": 0.10766146183013917, "memory(GiB)": 201.32, "response_clip_ratio": 0.0375, "reward": 1.470962393283844, "reward_std": 0.2528735466301441, "rewards/CosineReward": 0.628222519159317, "rewards/Format": 0.89375, "rewards/RepetitionPenalty": -0.051010148227214815, "step": 120, "train_speed(iter/s)": 0.00986 }, { "clip_ratio": 0.0, "completion_length": 1310.29375, "epoch": 0.052525252525252523, "grad_norm": 0.07152841985225677, "kl": 0.008576202392578124, "learning_rate": 9.999839293045841e-06, "loss": 0.11151189804077148, "memory(GiB)": 201.32, "response_clip_ratio": 0.0375, "reward": 1.4794305562973022, "reward_std": 0.29553089290857315, "rewards/CosineReward": 0.6588298320770264, "rewards/Format": 0.871875, "rewards/RepetitionPenalty": -0.05127429813146591, "step": 130, "train_speed(iter/s)": 0.010081 }, { "clip_ratio": 0.0, "completion_length": 1394.84375, "epoch": 0.05656565656565657, "grad_norm": 0.06534178555011749, "kl": 0.003376007080078125, "learning_rate": 9.998857232403944e-06, "loss": 0.10313808917999268, "memory(GiB)": 201.32, "response_clip_ratio": 0.053125, "reward": 1.4630812048912047, "reward_std": 0.29973402321338655, "rewards/CosineReward": 0.6330412924289703, "rewards/Format": 0.88125, "rewards/RepetitionPenalty": -0.05121009349822998, "step": 140, "train_speed(iter/s)": 0.010258 }, { "epoch": 0.06060606060606061, "grad_norm": 0.12454958260059357, "learning_rate": 9.996982567907083e-06, "loss": 0.108436119556427, "memory(GiB)": 201.32, "step": 150, "train_speed(iter/s)": 0.010448 }, { "epoch": 0.06060606060606061, "eval_clip_ratio": 0.0, "eval_completion_length": 1385.4125, "eval_kl": 0.0031256103515625, "eval_loss": 0.10945934057235718, "eval_response_clip_ratio": 0.03625, "eval_reward": 1.4737121772766113, "eval_reward_std": 0.27101251542568205, "eval_rewards/CosineReward": 0.6330605340003967, "eval_rewards/Format": 0.89125, "eval_rewards/RepetitionPenalty": -0.05059833467006683, "eval_runtime": 1597.4301, "eval_samples_per_second": 0.063, "eval_steps_per_second": 0.003, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 1358.803125, "epoch": 0.06464646464646465, "grad_norm": 0.07744200527667999, "kl": 0.004645538330078125, "learning_rate": 9.994215634298318e-06, "loss": 0.13394694328308104, "memory(GiB)": 201.32, "response_clip_ratio": 0.0328125, "reward": 1.4699846863746644, "reward_std": 0.25304545238614085, "rewards/CosineReward": 0.6336570367217064, "rewards/Format": 0.8859375, "rewards/RepetitionPenalty": -0.04960984513163567, "step": 160, "train_speed(iter/s)": 0.009583 }, { "clip_ratio": 0.0, "completion_length": 1366.4375, "epoch": 0.06868686868686869, "grad_norm": 0.09392488747835159, "kl": 0.0045501708984375, "learning_rate": 9.990556925645689e-06, "loss": 0.12707479000091554, "memory(GiB)": 201.32, "response_clip_ratio": 0.053125, "reward": 1.4583564758300782, "reward_std": 0.3112357288599014, "rewards/CosineReward": 0.641951459646225, "rewards/Format": 0.875, "rewards/RepetitionPenalty": -0.058594975247979166, "step": 170, "train_speed(iter/s)": 0.009751 }, { "clip_ratio": 0.0, "completion_length": 1331.265625, "epoch": 0.07272727272727272, "grad_norm": 0.08582872897386551, "kl": 0.00849761962890625, "learning_rate": 9.986007095254003e-06, "loss": 0.10804812908172608, "memory(GiB)": 201.32, "response_clip_ratio": 0.025, "reward": 1.5224240660667419, "reward_std": 0.21244690269231797, "rewards/CosineReward": 0.6498659372329711, "rewards/Format": 0.921875, "rewards/RepetitionPenalty": -0.04931688979268074, "step": 180, "train_speed(iter/s)": 0.009918 }, { "clip_ratio": 0.0, "completion_length": 1319.584375, "epoch": 0.07676767676767676, "grad_norm": 0.08151663094758987, "kl": 0.010662841796875, "learning_rate": 9.98056695554817e-06, "loss": 0.0835932433605194, "memory(GiB)": 201.32, "response_clip_ratio": 0.0125, "reward": 1.5347412347793579, "reward_std": 0.19547156170010566, "rewards/CosineReward": 0.6534979581832886, "rewards/Format": 0.928125, "rewards/RepetitionPenalty": -0.0468817338347435, "step": 190, "train_speed(iter/s)": 0.010083 }, { "epoch": 0.08080808080808081, "grad_norm": 0.09825429320335388, "learning_rate": 9.974237477928142e-06, "loss": 0.1302722692489624, "memory(GiB)": 201.32, "step": 200, "train_speed(iter/s)": 0.010223 }, { "epoch": 0.08080808080808081, "eval_clip_ratio": 0.0, "eval_completion_length": 1304.91875, "eval_kl": 0.0116064453125, "eval_loss": 0.11376148462295532, "eval_response_clip_ratio": 0.03375, "eval_reward": 1.4866490745544434, "eval_reward_std": 0.28797954961657524, "eval_rewards/CosineReward": 0.6583965635299682, "eval_rewards/Format": 0.88, "eval_rewards/RepetitionPenalty": -0.05174747854471207, "eval_runtime": 1497.4837, "eval_samples_per_second": 0.067, "eval_steps_per_second": 0.003, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 1278.5671875, "epoch": 0.08484848484848485, "grad_norm": 0.09233655780553818, "kl": 0.016357421875, "learning_rate": 9.967019792595458e-06, "loss": 0.13105809688568115, "memory(GiB)": 201.32, "response_clip_ratio": 0.028125, "reward": 1.5280364274978637, "reward_std": 0.24829243943095208, "rewards/CosineReward": 0.668746542930603, "rewards/Format": 0.9078125, "rewards/RepetitionPenalty": -0.048522604070603846, "step": 210, "train_speed(iter/s)": 0.009624 }, { "clip_ratio": 0.0, "completion_length": 1158.809375, "epoch": 0.08888888888888889, "grad_norm": 0.10016483068466187, "kl": 0.02237548828125, "learning_rate": 9.958915188351423e-06, "loss": 0.13057438135147095, "memory(GiB)": 201.32, "response_clip_ratio": 0.0125, "reward": 1.5936293482780457, "reward_std": 0.1992014855146408, "rewards/CosineReward": 0.7059894382953644, "rewards/Format": 0.934375, "rewards/RepetitionPenalty": -0.04673512801527977, "step": 220, "train_speed(iter/s)": 0.009777 }, { "clip_ratio": 0.0, "completion_length": 1245.665625, "epoch": 0.09292929292929293, "grad_norm": 0.10577544569969177, "kl": 0.0184814453125, "learning_rate": 9.949925112366996e-06, "loss": 0.11538821458816528, "memory(GiB)": 201.32, "response_clip_ratio": 0.021875, "reward": 1.5218712568283081, "reward_std": 0.2517738312482834, "rewards/CosineReward": 0.6763593494892121, "rewards/Format": 0.9, "rewards/RepetitionPenalty": -0.05448807217180729, "step": 230, "train_speed(iter/s)": 0.009912 }, { "clip_ratio": 0.0, "completion_length": 1192.153125, "epoch": 0.09696969696969697, "grad_norm": 0.10936521738767624, "kl": 0.03348388671875, "learning_rate": 9.940051169924363e-06, "loss": 0.12029192447662354, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.5782066345214845, "reward_std": 0.2129870519042015, "rewards/CosineReward": 0.6932173430919647, "rewards/Format": 0.928125, "rewards/RepetitionPenalty": -0.04313567094504833, "step": 240, "train_speed(iter/s)": 0.010057 }, { "epoch": 0.10101010101010101, "grad_norm": 0.11618863791227341, "learning_rate": 9.929295124130306e-06, "loss": 0.11959065198898315, "memory(GiB)": 201.32, "step": 250, "train_speed(iter/s)": 0.010173 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completion_length": 1210.21125, "eval_kl": 0.025126953125, "eval_loss": 0.11051134765148163, "eval_response_clip_ratio": 0.0125, "eval_reward": 1.569578037261963, "eval_reward_std": 0.2060703954100609, "eval_rewards/CosineReward": 0.6852084493637085, "eval_rewards/Format": 0.92875, "eval_rewards/RepetitionPenalty": -0.04438042297959328, "eval_runtime": 1423.4277, "eval_samples_per_second": 0.07, "eval_steps_per_second": 0.003, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 1207.4578125, "epoch": 0.10505050505050505, "grad_norm": 0.1024615466594696, "kl": 0.0247314453125, "learning_rate": 9.91765889560138e-06, "loss": 0.13996527194976807, "memory(GiB)": 201.32, "response_clip_ratio": 0.0109375, "reward": 1.5623638153076171, "reward_std": 0.23076082281768323, "rewards/CosineReward": 0.6895100384950638, "rewards/Format": 0.91875, "rewards/RepetitionPenalty": -0.04589622914791107, "step": 260, "train_speed(iter/s)": 0.009748 }, { "clip_ratio": 0.0, "completion_length": 1167.8125, "epoch": 0.10909090909090909, "grad_norm": 0.10518812388181686, "kl": 0.03485107421875, "learning_rate": 9.905144562120954e-06, "loss": 0.1367172122001648, "memory(GiB)": 201.32, "response_clip_ratio": 0.0125, "reward": 1.5755935907363892, "reward_std": 0.22920711785554887, "rewards/CosineReward": 0.70100417137146, "rewards/Format": 0.921875, "rewards/RepetitionPenalty": -0.04728559516370297, "step": 270, "train_speed(iter/s)": 0.009876 }, { "clip_ratio": 0.0, "completion_length": 1117.24375, "epoch": 0.11313131313131314, "grad_norm": 0.10396507382392883, "kl": 0.0427978515625, "learning_rate": 9.891754358268213e-06, "loss": 0.11641757488250733, "memory(GiB)": 201.32, "response_clip_ratio": 0.00625, "reward": 1.613683557510376, "reward_std": 0.19217313975095748, "rewards/CosineReward": 0.7183997511863709, "rewards/Format": 0.940625, "rewards/RepetitionPenalty": -0.045341220870614055, "step": 280, "train_speed(iter/s)": 0.010007 }, { "clip_ratio": 0.0, "completion_length": 1093.9875, "epoch": 0.11717171717171718, "grad_norm": 0.12279438227415085, "kl": 0.046435546875, "learning_rate": 9.877490675019137e-06, "loss": 0.13355908393859864, "memory(GiB)": 201.32, "response_clip_ratio": 0.00625, "reward": 1.6440497517585755, "reward_std": 0.16423990577459335, "rewards/CosineReward": 0.7270485639572144, "rewards/Format": 0.959375, "rewards/RepetitionPenalty": -0.04237380102276802, "step": 290, "train_speed(iter/s)": 0.010138 }, { "epoch": 0.12121212121212122, "grad_norm": 0.14417685568332672, "learning_rate": 9.862356059319571e-06, "loss": 0.12106183767318726, "memory(GiB)": 201.32, "step": 300, "train_speed(iter/s)": 0.010251 }, { "epoch": 0.12121212121212122, "eval_clip_ratio": 0.0, "eval_completion_length": 1090.1175, "eval_kl": 0.0566796875, "eval_loss": 0.11904594302177429, "eval_response_clip_ratio": 0.0025, "eval_reward": 1.6312160968780518, "eval_reward_std": 0.18964565873146058, "eval_rewards/CosineReward": 0.7258864951133728, "eval_rewards/Format": 0.94375, "eval_rewards/RepetitionPenalty": -0.03842037230730057, "eval_runtime": 1274.6744, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.003, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 1072.3796875, "epoch": 0.12525252525252525, "grad_norm": 0.1258593499660492, "kl": 0.06575927734375, "learning_rate": 9.846353213630435e-06, "loss": 0.10835564136505127, "memory(GiB)": 201.32, "response_clip_ratio": 0.0046875, "reward": 1.5897472143173217, "reward_std": 0.23233521170914173, "rewards/CosineReward": 0.7244264006614685, "rewards/Format": 0.9078125, "rewards/RepetitionPenalty": -0.04249171484261751, "step": 310, "train_speed(iter/s)": 0.009958 }, { "clip_ratio": 0.0, "completion_length": 977.3625, "epoch": 0.1292929292929293, "grad_norm": 0.17092444002628326, "kl": 0.07841796875, "learning_rate": 9.82948499544517e-06, "loss": 0.12514870166778563, "memory(GiB)": 201.32, "response_clip_ratio": 0.009375, "reward": 1.6661622881889344, "reward_std": 0.21618551313877105, "rewards/CosineReward": 0.7686496555805207, "rewards/Format": 0.934375, "rewards/RepetitionPenalty": -0.03686234261840582, "step": 320, "train_speed(iter/s)": 0.01009 }, { "clip_ratio": 0.0, "completion_length": 1035.321875, "epoch": 0.13333333333333333, "grad_norm": 0.13263164460659027, "kl": 0.081396484375, "learning_rate": 9.8117544167795e-06, "loss": 0.08669453859329224, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.6461937308311463, "reward_std": 0.17310074269771575, "rewards/CosineReward": 0.746637761592865, "rewards/Format": 0.934375, "rewards/RepetitionPenalty": -0.03481901176273823, "step": 330, "train_speed(iter/s)": 0.010227 }, { "clip_ratio": 0.0, "completion_length": 1039.71875, "epoch": 0.13737373737373737, "grad_norm": 0.13215574622154236, "kl": 0.1201171875, "learning_rate": 9.793164643633595e-06, "loss": 0.12218025922775269, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.6522927284240723, "reward_std": 0.1630481503903866, "rewards/CosineReward": 0.7451346576213836, "rewards/Format": 0.94375, "rewards/RepetitionPenalty": -0.03659196365624666, "step": 340, "train_speed(iter/s)": 0.010358 }, { "epoch": 0.1414141414141414, "grad_norm": 0.9451874494552612, "learning_rate": 9.773718995426757e-06, "loss": 0.11296249628067016, "memory(GiB)": 201.32, "step": 350, "train_speed(iter/s)": 0.010492 }, { "epoch": 0.1414141414141414, "eval_clip_ratio": 0.0, "eval_completion_length": 1012.01375, "eval_kl": 0.09197265625, "eval_loss": 0.10980751365423203, "eval_response_clip_ratio": 0.00875, "eval_reward": 1.6614595317840577, "eval_reward_std": 0.1891869069635868, "eval_rewards/CosineReward": 0.7551157426834106, "eval_rewards/Format": 0.94, "eval_rewards/RepetitionPenalty": -0.03365620955824852, "eval_runtime": 1241.9184, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.003, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 980.084375, "epoch": 0.14545454545454545, "grad_norm": 0.1352473646402359, "kl": 0.1057861328125, "learning_rate": 9.75342094440469e-06, "loss": 0.13101534843444823, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.668820822238922, "reward_std": 0.17138062026351691, "rewards/CosineReward": 0.7667600721120834, "rewards/Format": 0.934375, "rewards/RepetitionPenalty": -0.03231424409896135, "step": 360, "train_speed(iter/s)": 0.010231 }, { "clip_ratio": 0.0, "completion_length": 919.353125, "epoch": 0.1494949494949495, "grad_norm": 0.19250620901584625, "kl": 0.112060546875, "learning_rate": 9.732274115019496e-06, "loss": 0.12073533535003662, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.7050742745399474, "reward_std": 0.16779937073588372, "rewards/CosineReward": 0.7879343628883362, "rewards/Format": 0.95, "rewards/RepetitionPenalty": -0.03286007363349199, "step": 370, "train_speed(iter/s)": 0.010362 }, { "clip_ratio": 0.0, "completion_length": 860.453125, "epoch": 0.15353535353535352, "grad_norm": 0.16778206825256348, "kl": 0.15791015625, "learning_rate": 9.710282283282476e-06, "loss": 0.1197171688079834, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.7079741597175597, "reward_std": 0.1786061268299818, "rewards/CosineReward": 0.8102583944797516, "rewards/Format": 0.928125, "rewards/RepetitionPenalty": -0.030409247800707816, "step": 380, "train_speed(iter/s)": 0.010495 }, { "clip_ratio": 0.0, "completion_length": 864.534375, "epoch": 0.15757575757575756, "grad_norm": 0.19480502605438232, "kl": 0.19189453125, "learning_rate": 9.687449376089898e-06, "loss": 0.11495430469512939, "memory(GiB)": 201.32, "response_clip_ratio": 0.003125, "reward": 1.7451889276504517, "reward_std": 0.12189406082034111, "rewards/CosineReward": 0.809339451789856, "rewards/Format": 0.965625, "rewards/RepetitionPenalty": -0.02977552227675915, "step": 390, "train_speed(iter/s)": 0.010629 }, { "epoch": 0.16161616161616163, "grad_norm": 0.1948593109846115, "learning_rate": 9.663779470521788e-06, "loss": 0.10618784427642822, "memory(GiB)": 201.32, "step": 400, "train_speed(iter/s)": 0.01076 }, { "epoch": 0.16161616161616163, "eval_clip_ratio": 0.0, "eval_completion_length": 860.40375, "eval_kl": 0.9400390625, "eval_loss": 0.13976825773715973, "eval_response_clip_ratio": 0.00375, "eval_reward": 1.7388446187973023, "eval_reward_std": 0.12333237871527672, "eval_rewards/CosineReward": 0.8105894351005554, "eval_rewards/Format": 0.955, "eval_rewards/RepetitionPenalty": -0.0267448078840971, "eval_runtime": 1007.3718, "eval_samples_per_second": 0.099, "eval_steps_per_second": 0.004, "step": 400 } ], "logging_steps": 10, "max_steps": 2475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }