{ "best_metric": 0.32666667327284815, "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/intern2.5vl-7b-grpo_v2/v8-20250328-093218/checkpoint-2475", "epoch": 1.0, "eval_steps": 250, "global_step": 2475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 409.79168701171875, "epoch": 0.00040404040404040404, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.6129032258064515e-09, "loss": 0.0, "memory(GiB)": 53.97, "response_clip_ratio": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/MultiModalAccuracyORM": 0.0, "step": 1, "train_speed(iter/s)": 0.01394 }, { "clip_ratio": 0.0, "completion_length": 280.04167318344116, "epoch": 0.00202020202020202, "grad_norm": 0.8363032478879351, "kl": 0.0014553563960362226, "learning_rate": 8.064516129032257e-09, "loss": 0.02412133663892746, "memory(GiB)": 66.4, "response_clip_ratio": 0.0, "reward": 0.13541666977107525, "reward_std": 0.2260890230536461, "rewards/MultiModalAccuracyORM": 0.13541666977107525, "step": 5, "train_speed(iter/s)": 0.028028 }, { "clip_ratio": 0.0, "completion_length": 310.8333435058594, "epoch": 0.00404040404040404, "grad_norm": 0.6139540103661346, "kl": 0.0016570288338698448, "learning_rate": 1.6129032258064514e-08, "loss": -0.01782941520214081, "memory(GiB)": 66.57, "response_clip_ratio": 0.0, "reward": 0.08333333507180214, "reward_std": 0.20967912971973418, "rewards/MultiModalAccuracyORM": 0.08333333507180214, "step": 10, "train_speed(iter/s)": 0.031112 }, { "clip_ratio": 0.0, "completion_length": 388.75834503173826, "epoch": 0.006060606060606061, "grad_norm": 0.40096093890994094, "kl": 0.001698582514654845, "learning_rate": 2.4193548387096773e-08, "loss": 0.026962581276893615, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.2833333432674408, "reward_std": 0.3393357157707214, "rewards/MultiModalAccuracyORM": 0.2833333432674408, "step": 15, "train_speed(iter/s)": 0.031299 }, { "clip_ratio": 0.0, "completion_length": 297.6416748046875, "epoch": 0.00808080808080808, "grad_norm": 1.4739542997288975, "kl": 0.0019028475042432546, "learning_rate": 3.225806451612903e-08, "loss": 0.038644880056381226, "memory(GiB)": 67.01, "response_clip_ratio": 0.01666666716337204, "reward": 0.2666666753590107, "reward_std": 0.2996539086103439, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 20, "train_speed(iter/s)": 0.031377 }, { "clip_ratio": 0.0, "completion_length": 374.5166717529297, "epoch": 0.010101010101010102, "grad_norm": 0.7741728453598145, "kl": 0.0016279776813462377, "learning_rate": 4.032258064516129e-08, "loss": 0.004998515546321869, "memory(GiB)": 67.01, "response_clip_ratio": 0.01666666716337204, "reward": 0.2416666716337204, "reward_std": 0.3144540905952454, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 25, "train_speed(iter/s)": 0.030159 }, { "clip_ratio": 0.0, "completion_length": 379.8333404541016, "epoch": 0.012121212121212121, "grad_norm": 0.9701247553439714, "kl": 0.0015773880179040134, "learning_rate": 4.8387096774193546e-08, "loss": -0.0044724434614181515, "memory(GiB)": 67.01, "response_clip_ratio": 0.03333333432674408, "reward": 0.2500000067055225, "reward_std": 0.35868159830570223, "rewards/MultiModalAccuracyORM": 0.2500000067055225, "step": 30, "train_speed(iter/s)": 0.030283 }, { "clip_ratio": 0.0, "completion_length": 357.30834197998047, "epoch": 0.014141414141414142, "grad_norm": 1.1959341136654718, "kl": 0.001618355477694422, "learning_rate": 5.645161290322581e-08, "loss": 0.023464329540729523, "memory(GiB)": 67.01, "response_clip_ratio": 0.03333333432674408, "reward": 0.20833333805203438, "reward_std": 0.2963388442993164, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 35, "train_speed(iter/s)": 0.030014 }, { "clip_ratio": 0.0, "completion_length": 301.2083457946777, "epoch": 0.01616161616161616, "grad_norm": 1.410798812581158, "kl": 0.0019752797903493046, "learning_rate": 6.451612903225806e-08, "loss": -0.0007259666919708252, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.19166667088866235, "reward_std": 0.33526378870010376, "rewards/MultiModalAccuracyORM": 0.19166667088866235, "step": 40, "train_speed(iter/s)": 0.031276 }, { "clip_ratio": 0.0, "completion_length": 453.8916778564453, "epoch": 0.01818181818181818, "grad_norm": 0.7834115303183283, "kl": 0.0015114007983356714, "learning_rate": 7.258064516129032e-08, "loss": 0.02698530852794647, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.13333333879709244, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.13333333879709244, "step": 45, "train_speed(iter/s)": 0.030831 }, { "clip_ratio": 0.0, "completion_length": 399.0833435058594, "epoch": 0.020202020202020204, "grad_norm": 0.5949445172203021, "kl": 0.0016400692868046463, "learning_rate": 8.064516129032257e-08, "loss": 0.00902385413646698, "memory(GiB)": 67.01, "response_clip_ratio": 0.01666666716337204, "reward": 0.1166666679084301, "reward_std": 0.22297748029232026, "rewards/MultiModalAccuracyORM": 0.1166666679084301, "step": 50, "train_speed(iter/s)": 0.030983 }, { "clip_ratio": 0.0, "completion_length": 283.9916702270508, "epoch": 0.022222222222222223, "grad_norm": 0.8683855913813096, "kl": 0.001812657283153385, "learning_rate": 8.870967741935484e-08, "loss": 0.020981660485267638, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.1916666693985462, "reward_std": 0.30718872845172884, "rewards/MultiModalAccuracyORM": 0.1916666693985462, "step": 55, "train_speed(iter/s)": 0.031708 }, { "clip_ratio": 0.0, "completion_length": 233.82500686645508, "epoch": 0.024242424242424242, "grad_norm": 0.7530936756741837, "kl": 0.001994300523074344, "learning_rate": 9.677419354838709e-08, "loss": -0.0005262017250061036, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.2583333432674408, "reward_std": 0.3782250702381134, "rewards/MultiModalAccuracyORM": 0.2583333432674408, "step": 60, "train_speed(iter/s)": 0.03245 }, { "clip_ratio": 0.0, "completion_length": 479.90834655761716, "epoch": 0.026262626262626262, "grad_norm": 1.067432975947567, "kl": 0.0014999201346654444, "learning_rate": 1.0483870967741934e-07, "loss": 0.016191774606704713, "memory(GiB)": 67.01, "response_clip_ratio": 0.041666668653488156, "reward": 0.21666667386889457, "reward_std": 0.3534030467271805, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 65, "train_speed(iter/s)": 0.032048 }, { "clip_ratio": 0.0, "completion_length": 305.74167785644534, "epoch": 0.028282828282828285, "grad_norm": 0.6557989849641392, "kl": 0.0015661009470932185, "learning_rate": 1.1290322580645162e-07, "loss": 0.00013453364372253417, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.2367905855178833, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 70, "train_speed(iter/s)": 0.032349 }, { "clip_ratio": 0.0, "completion_length": 254.53333625793456, "epoch": 0.030303030303030304, "grad_norm": 1.4990729586396685, "kl": 0.001931124395923689, "learning_rate": 1.2096774193548387e-07, "loss": -0.014107623696327209, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.16666667237877847, "reward_std": 0.28758862614631653, "rewards/MultiModalAccuracyORM": 0.16666667237877847, "step": 75, "train_speed(iter/s)": 0.033073 }, { "clip_ratio": 0.0, "completion_length": 363.6750129699707, "epoch": 0.03232323232323232, "grad_norm": 0.009474604451451558, "kl": 0.0016073725128080696, "learning_rate": 1.2903225806451611e-07, "loss": 0.02208370268344879, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.13333333507180214, "reward_std": 0.214479061961174, "rewards/MultiModalAccuracyORM": 0.13333333507180214, "step": 80, "train_speed(iter/s)": 0.032802 }, { "clip_ratio": 0.0, "completion_length": 266.858341217041, "epoch": 0.03434343434343434, "grad_norm": 0.8129277425663378, "kl": 0.0015607591019943356, "learning_rate": 1.3709677419354838e-07, "loss": 0.03339255452156067, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.27393454909324644, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 85, "train_speed(iter/s)": 0.033288 }, { "clip_ratio": 0.0, "completion_length": 328.29166717529296, "epoch": 0.03636363636363636, "grad_norm": 1.0237992843062258, "kl": 0.0018175460281781852, "learning_rate": 1.4516129032258064e-07, "loss": -0.0047673434019088745, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.19166667237877846, "reward_std": 0.31222184002399445, "rewards/MultiModalAccuracyORM": 0.19166667237877846, "step": 90, "train_speed(iter/s)": 0.02855 }, { "clip_ratio": 0.0, "completion_length": 390.2666778564453, "epoch": 0.03838383838383838, "grad_norm": 1.0063561945582478, "kl": 0.0016310916107613593, "learning_rate": 1.5322580645161288e-07, "loss": 0.012225335836410523, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.31040860116481783, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 95, "train_speed(iter/s)": 0.028934 }, { "clip_ratio": 0.0, "completion_length": 347.8333480834961, "epoch": 0.04040404040404041, "grad_norm": 0.6258205936576121, "kl": 0.0016133204102516175, "learning_rate": 1.6129032258064515e-07, "loss": -0.03874449729919434, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.21666667386889457, "reward_std": 0.24935851097106934, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 100, "train_speed(iter/s)": 0.029069 }, { "clip_ratio": 0.0, "completion_length": 356.62500915527346, "epoch": 0.04242424242424243, "grad_norm": 0.7939711873436608, "kl": 0.0018501532729715108, "learning_rate": 1.6935483870967741e-07, "loss": -0.03681076169013977, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.14166667088866233, "reward_std": 0.30009694397449493, "rewards/MultiModalAccuracyORM": 0.14166667088866233, "step": 105, "train_speed(iter/s)": 0.029156 }, { "clip_ratio": 0.0, "completion_length": 334.25000762939453, "epoch": 0.044444444444444446, "grad_norm": 0.8077469947790702, "kl": 0.001749929750803858, "learning_rate": 1.7741935483870968e-07, "loss": 0.0030364990234375, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.36744636595249175, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 110, "train_speed(iter/s)": 0.029419 }, { "clip_ratio": 0.0, "completion_length": 322.3250114440918, "epoch": 0.046464646464646465, "grad_norm": 0.6490437761753438, "kl": 0.0021277177263982596, "learning_rate": 1.8548387096774192e-07, "loss": 0.0014735162258148193, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.20000000298023224, "reward_std": 0.3026430279016495, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 115, "train_speed(iter/s)": 0.029726 }, { "clip_ratio": 0.0, "completion_length": 356.07501220703125, "epoch": 0.048484848484848485, "grad_norm": 0.7766724930460716, "kl": 0.0019528187229298055, "learning_rate": 1.9354838709677418e-07, "loss": 0.006993652880191803, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.1916666693985462, "reward_std": 0.3041424334049225, "rewards/MultiModalAccuracyORM": 0.1916666693985462, "step": 120, "train_speed(iter/s)": 0.030043 }, { "clip_ratio": 0.0, "completion_length": 291.8666763305664, "epoch": 0.050505050505050504, "grad_norm": 1.7360957995530275, "kl": 0.0021437739836983384, "learning_rate": 2e-07, "loss": 0.07869662046432495, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.2500000067055225, "reward_std": 0.2706790864467621, "rewards/MultiModalAccuracyORM": 0.2500000067055225, "step": 125, "train_speed(iter/s)": 0.030399 }, { "clip_ratio": 0.0, "completion_length": 423.4416778564453, "epoch": 0.052525252525252523, "grad_norm": 0.7239088304012476, "kl": 0.001710877218283713, "learning_rate": 2e-07, "loss": 0.009808599948883057, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.2666666768491268, "reward_std": 0.40890581607818605, "rewards/MultiModalAccuracyORM": 0.2666666768491268, "step": 130, "train_speed(iter/s)": 0.030332 }, { "clip_ratio": 0.0, "completion_length": 329.34167633056643, "epoch": 0.05454545454545454, "grad_norm": 0.8447168227598457, "kl": 0.002082096762023866, "learning_rate": 2e-07, "loss": -0.03216500878334046, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.3019101768732071, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 135, "train_speed(iter/s)": 0.030718 }, { "clip_ratio": 0.0, "completion_length": 324.9166748046875, "epoch": 0.05656565656565657, "grad_norm": 1.0324727992148468, "kl": 0.0018370800535194576, "learning_rate": 2e-07, "loss": 0.023554784059524537, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.33395901322364807, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 140, "train_speed(iter/s)": 0.030908 }, { "clip_ratio": 0.0, "completion_length": 248.10834045410155, "epoch": 0.05858585858585859, "grad_norm": 1.3871585996389504, "kl": 0.0021007918752729894, "learning_rate": 2e-07, "loss": 0.026919734477996827, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.21666666865348816, "reward_std": 0.21753989458084105, "rewards/MultiModalAccuracyORM": 0.21666666865348816, "step": 145, "train_speed(iter/s)": 0.031304 }, { "clip_ratio": 0.0, "completion_length": 289.16667556762695, "epoch": 0.06060606060606061, "grad_norm": 0.8132545647619184, "kl": 0.002322551829274744, "learning_rate": 2e-07, "loss": 0.07373623847961426, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.3083333447575569, "reward_std": 0.3354848504066467, "rewards/MultiModalAccuracyORM": 0.3083333447575569, "step": 150, "train_speed(iter/s)": 0.031477 }, { "clip_ratio": 0.0, "completion_length": 337.5250114440918, "epoch": 0.06262626262626263, "grad_norm": 1.3519945848264368, "kl": 0.002200189605355263, "learning_rate": 2e-07, "loss": 0.04400811195373535, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.13333333656191826, "reward_std": 0.29784067571163175, "rewards/MultiModalAccuracyORM": 0.13333333656191826, "step": 155, "train_speed(iter/s)": 0.031607 }, { "clip_ratio": 0.0, "completion_length": 362.05834312438964, "epoch": 0.06464646464646465, "grad_norm": 0.6312437669339288, "kl": 0.0019409565313253552, "learning_rate": 2e-07, "loss": -0.00390947014093399, "memory(GiB)": 67.01, "response_clip_ratio": 0.01666666716337204, "reward": 0.09166666865348816, "reward_std": 0.23854664266109465, "rewards/MultiModalAccuracyORM": 0.09166666865348816, "step": 160, "train_speed(iter/s)": 0.031439 }, { "clip_ratio": 0.0, "completion_length": 309.1583427429199, "epoch": 0.06666666666666667, "grad_norm": 0.9922303390769169, "kl": 0.0020790058420971035, "learning_rate": 2e-07, "loss": 0.02683091163635254, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.24010564982891083, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 165, "train_speed(iter/s)": 0.031658 }, { "clip_ratio": 0.0, "completion_length": 185.6333381652832, "epoch": 0.06868686868686869, "grad_norm": 1.8736558792915283, "kl": 0.0021017327206209304, "learning_rate": 2e-07, "loss": 0.053074592351913454, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.2416666753590107, "reward_std": 0.31345489621162415, "rewards/MultiModalAccuracyORM": 0.2416666753590107, "step": 170, "train_speed(iter/s)": 0.031906 }, { "clip_ratio": 0.0, "completion_length": 334.34167938232423, "epoch": 0.0707070707070707, "grad_norm": 0.369038153671658, "kl": 0.0021058263606391846, "learning_rate": 2e-07, "loss": -0.015150085091590881, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.19166667014360428, "reward_std": 0.2506715327501297, "rewards/MultiModalAccuracyORM": 0.19166667014360428, "step": 175, "train_speed(iter/s)": 0.032115 }, { "clip_ratio": 0.0, "completion_length": 292.1916778564453, "epoch": 0.07272727272727272, "grad_norm": 1.9587979389448584, "kl": 0.002187371510080993, "learning_rate": 2e-07, "loss": -0.07978157997131348, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.14166666939854622, "reward_std": 0.3041424334049225, "rewards/MultiModalAccuracyORM": 0.14166666939854622, "step": 180, "train_speed(iter/s)": 0.032328 }, { "clip_ratio": 0.0, "completion_length": 252.3166763305664, "epoch": 0.07474747474747474, "grad_norm": 1.8461501284258892, "kl": 0.002373928390443325, "learning_rate": 2e-07, "loss": 0.0032023414969444275, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.3078981190919876, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 185, "train_speed(iter/s)": 0.032577 }, { "clip_ratio": 0.0, "completion_length": 372.09167327880857, "epoch": 0.07676767676767676, "grad_norm": 1.161955111140227, "kl": 0.0020143969799391926, "learning_rate": 2e-07, "loss": 0.015145952999591827, "memory(GiB)": 67.01, "response_clip_ratio": 0.0, "reward": 0.14166667088866233, "reward_std": 0.22400068640708923, "rewards/MultiModalAccuracyORM": 0.14166667088866233, "step": 190, "train_speed(iter/s)": 0.032624 }, { "clip_ratio": 0.0, "completion_length": 301.4333442687988, "epoch": 0.07878787878787878, "grad_norm": 0.6821896150480984, "kl": 0.002065828931517899, "learning_rate": 2e-07, "loss": -0.008565062284469604, "memory(GiB)": 67.01, "response_clip_ratio": 0.00833333358168602, "reward": 0.20000000298023224, "reward_std": 0.21394325494766236, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 195, "train_speed(iter/s)": 0.032656 }, { "clip_ratio": 0.0, "completion_length": 377.93334503173827, "epoch": 0.08080808080808081, "grad_norm": 0.9729957971818891, "kl": 0.002064543019514531, "learning_rate": 2e-07, "loss": 0.05269354581832886, "memory(GiB)": 67.41, "response_clip_ratio": 0.02500000074505806, "reward": 0.21666667684912683, "reward_std": 0.27379952669143676, "rewards/MultiModalAccuracyORM": 0.21666667684912683, "step": 200, "train_speed(iter/s)": 0.032498 }, { "clip_ratio": 0.0, "completion_length": 366.07500915527345, "epoch": 0.08282828282828283, "grad_norm": 1.771257702172847, "kl": 0.0021668279776349665, "learning_rate": 2e-07, "loss": 0.05926605463027954, "memory(GiB)": 67.41, "response_clip_ratio": 0.00833333358168602, "reward": 0.14166666865348815, "reward_std": 0.29076993763446807, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 205, "train_speed(iter/s)": 0.032513 }, { "clip_ratio": 0.0, "completion_length": 312.17501373291014, "epoch": 0.08484848484848485, "grad_norm": 1.1847534793771106, "kl": 0.002231467212550342, "learning_rate": 2e-07, "loss": -0.04730735421180725, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000596046447, "reward_std": 0.30665292739868166, "rewards/MultiModalAccuracyORM": 0.17500000596046447, "step": 210, "train_speed(iter/s)": 0.032602 }, { "clip_ratio": 0.0, "completion_length": 318.25001068115233, "epoch": 0.08686868686868687, "grad_norm": 1.0162794234808679, "kl": 0.0021812492050230503, "learning_rate": 2e-07, "loss": -0.012527593970298767, "memory(GiB)": 67.41, "response_clip_ratio": 0.01666666716337204, "reward": 0.25000000596046446, "reward_std": 0.2488823115825653, "rewards/MultiModalAccuracyORM": 0.25000000596046446, "step": 215, "train_speed(iter/s)": 0.032672 }, { "clip_ratio": 0.0, "completion_length": 325.2083404541016, "epoch": 0.08888888888888889, "grad_norm": 1.6052431990433658, "kl": 0.002427070902194828, "learning_rate": 2e-07, "loss": 0.04005146026611328, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333410322666, "reward_std": 0.31740519404411316, "rewards/MultiModalAccuracyORM": 0.2333333410322666, "step": 220, "train_speed(iter/s)": 0.032834 }, { "clip_ratio": 0.0, "completion_length": 322.5166763305664, "epoch": 0.09090909090909091, "grad_norm": 1.2096008908286064, "kl": 0.002002272638492286, "learning_rate": 2e-07, "loss": 0.07909151315689086, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2250000096857548, "reward_std": 0.3757145762443542, "rewards/MultiModalAccuracyORM": 0.2250000096857548, "step": 225, "train_speed(iter/s)": 0.032966 }, { "clip_ratio": 0.0, "completion_length": 252.65000610351564, "epoch": 0.09292929292929293, "grad_norm": 1.2828051744936808, "kl": 0.002529059338849038, "learning_rate": 2e-07, "loss": 0.03620143532752991, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.2792848199605942, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 230, "train_speed(iter/s)": 0.033148 }, { "clip_ratio": 0.0, "completion_length": 362.3000091552734, "epoch": 0.09494949494949495, "grad_norm": 1.085440875767882, "kl": 0.0019764827913604675, "learning_rate": 2e-07, "loss": 0.03204571008682251, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667461395264, "reward_std": 0.34560188353061677, "rewards/MultiModalAccuracyORM": 0.21666667461395264, "step": 235, "train_speed(iter/s)": 0.033179 }, { "clip_ratio": 0.0, "completion_length": 292.416674041748, "epoch": 0.09696969696969697, "grad_norm": 0.49748232708825735, "kl": 0.002392634970601648, "learning_rate": 2e-07, "loss": -9.850338101387024e-05, "memory(GiB)": 67.41, "response_clip_ratio": 0.00833333358168602, "reward": 0.1666666679084301, "reward_std": 0.3134308844804764, "rewards/MultiModalAccuracyORM": 0.1666666679084301, "step": 240, "train_speed(iter/s)": 0.033246 }, { "clip_ratio": 0.0, "completion_length": 342.1333465576172, "epoch": 0.09898989898989899, "grad_norm": 1.2151595435030045, "kl": 0.0020633480802644045, "learning_rate": 2e-07, "loss": 0.04106523394584656, "memory(GiB)": 67.41, "response_clip_ratio": 0.02500000074505806, "reward": 0.3750000111758709, "reward_std": 0.3597048044204712, "rewards/MultiModalAccuracyORM": 0.3750000111758709, "step": 245, "train_speed(iter/s)": 0.033114 }, { "epoch": 0.10101010101010101, "grad_norm": 2.203412703075323, "learning_rate": 2e-07, "loss": -0.009947558492422104, "memory(GiB)": 67.41, "step": 250, "train_speed(iter/s)": 0.033255 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completion_length": 313.73167510986326, "eval_kl": 0.002364178735297173, "eval_loss": 0.011137718334794044, "eval_response_clip_ratio": 0.001666666716337204, "eval_reward": 0.1716666702926159, "eval_reward_std": 0.30018057823181155, "eval_rewards/MultiModalAccuracyORM": 0.1716666702926159, "eval_runtime": 611.4996, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.008, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 307.2083404541016, "epoch": 0.10303030303030303, "grad_norm": 0.6772835244109896, "kl": 0.0024091241066344082, "learning_rate": 2e-07, "loss": -0.014972110092639924, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.28962627798318863, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 255, "train_speed(iter/s)": 0.029399 }, { "clip_ratio": 0.0, "completion_length": 229.6, "epoch": 0.10505050505050505, "grad_norm": 1.2025473761483534, "kl": 0.0028577180579304694, "learning_rate": 2e-07, "loss": 0.04020859003067016, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666716337204, "reward_std": 0.34929499626159666, "rewards/MultiModalAccuracyORM": 0.3166666716337204, "step": 260, "train_speed(iter/s)": 0.029617 }, { "clip_ratio": 0.0, "completion_length": 337.6, "epoch": 0.10707070707070707, "grad_norm": 0.4120260278240223, "kl": 0.0021677285199984907, "learning_rate": 2e-07, "loss": -0.0139850914478302, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000521540643, "reward_std": 0.30015655159950255, "rewards/MultiModalAccuracyORM": 0.17500000521540643, "step": 265, "train_speed(iter/s)": 0.029704 }, { "clip_ratio": 0.0, "completion_length": 366.85, "epoch": 0.10909090909090909, "grad_norm": 0.8561435874110225, "kl": 0.0018193130497820675, "learning_rate": 2e-07, "loss": 0.08367395997047425, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.07500000223517418, "reward_std": 0.22218745350837707, "rewards/MultiModalAccuracyORM": 0.07500000223517418, "step": 270, "train_speed(iter/s)": 0.029758 }, { "clip_ratio": 0.0, "completion_length": 251.2, "epoch": 0.1111111111111111, "grad_norm": 0.9014771216720453, "kl": 0.0027342547429725526, "learning_rate": 2e-07, "loss": -0.017020440101623534, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25000000447034837, "reward_std": 0.30515109598636625, "rewards/MultiModalAccuracyORM": 0.25000000447034837, "step": 275, "train_speed(iter/s)": 0.029979 }, { "clip_ratio": 0.0, "completion_length": 314.9, "epoch": 0.11313131313131314, "grad_norm": 0.8605819537524286, "kl": 0.002405107906088233, "learning_rate": 2e-07, "loss": -0.021587955951690673, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667610406875, "reward_std": 0.2674381673336029, "rewards/MultiModalAccuracyORM": 0.24166667610406875, "step": 280, "train_speed(iter/s)": 0.030012 }, { "clip_ratio": 0.0, "completion_length": 420.35, "epoch": 0.11515151515151516, "grad_norm": 0.8251215931120387, "kl": 0.0022269786451943217, "learning_rate": 2e-07, "loss": -0.009860058128833771, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333380520344, "reward_std": 0.24239750802516938, "rewards/MultiModalAccuracyORM": 0.1833333380520344, "step": 285, "train_speed(iter/s)": 0.030091 }, { "clip_ratio": 0.0, "completion_length": 288.7, "epoch": 0.11717171717171718, "grad_norm": 1.2527152485469888, "kl": 0.0021428745938465, "learning_rate": 2e-07, "loss": -0.023031486570835112, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.16666667014360428, "reward_std": 0.32451152205467226, "rewards/MultiModalAccuracyORM": 0.16666667014360428, "step": 290, "train_speed(iter/s)": 0.030252 }, { "clip_ratio": 0.0, "completion_length": 313.55, "epoch": 0.1191919191919192, "grad_norm": 1.6313717819183706, "kl": 0.0029145212611183524, "learning_rate": 2e-07, "loss": 0.010453201830387115, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.36569273471832275, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 295, "train_speed(iter/s)": 0.030281 }, { "clip_ratio": 0.0, "completion_length": 409.95, "epoch": 0.12121212121212122, "grad_norm": 0.9763637277897765, "kl": 0.002011374046560377, "learning_rate": 2e-07, "loss": -0.004069572687149048, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.1416666716337204, "reward_std": 0.2574163258075714, "rewards/MultiModalAccuracyORM": 0.1416666716337204, "step": 300, "train_speed(iter/s)": 0.030274 }, { "clip_ratio": 0.0, "completion_length": 294.9, "epoch": 0.12323232323232323, "grad_norm": 0.6068062718184686, "kl": 0.002677905629388988, "learning_rate": 2e-07, "loss": 0.029740142822265624, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.3315081149339676, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 305, "train_speed(iter/s)": 0.030352 }, { "clip_ratio": 0.0, "completion_length": 346.2, "epoch": 0.12525252525252525, "grad_norm": 1.6191575053592702, "kl": 0.002568906731903553, "learning_rate": 2e-07, "loss": 0.014141106605529785, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.275000012665987, "reward_std": 0.3184880018234253, "rewards/MultiModalAccuracyORM": 0.275000012665987, "step": 310, "train_speed(iter/s)": 0.030382 }, { "clip_ratio": 0.0, "completion_length": 575.05, "epoch": 0.12727272727272726, "grad_norm": 0.8100491404341938, "kl": 0.0023127400781959295, "learning_rate": 2e-07, "loss": 0.03490907847881317, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.10833333656191826, "reward_std": 0.20343697369098662, "rewards/MultiModalAccuracyORM": 0.10833333656191826, "step": 315, "train_speed(iter/s)": 0.030396 }, { "clip_ratio": 0.0, "completion_length": 263.35, "epoch": 0.1292929292929293, "grad_norm": 0.013915602281916708, "kl": 0.00306427797768265, "learning_rate": 2e-07, "loss": -0.017041555047035216, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833333507180214, "reward_std": 0.23609575033187866, "rewards/MultiModalAccuracyORM": 0.15833333507180214, "step": 320, "train_speed(iter/s)": 0.030583 }, { "clip_ratio": 0.0, "completion_length": 382.05, "epoch": 0.13131313131313133, "grad_norm": 1.1033620456072635, "kl": 0.002842709410469979, "learning_rate": 2e-07, "loss": 0.011531709134578705, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.3182693660259247, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 325, "train_speed(iter/s)": 0.030665 }, { "clip_ratio": 0.0, "completion_length": 308.55, "epoch": 0.13333333333333333, "grad_norm": 0.9537144351366293, "kl": 0.002481410140171647, "learning_rate": 2e-07, "loss": 0.0630490779876709, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666693985462, "reward_std": 0.26196202635765076, "rewards/MultiModalAccuracyORM": 0.1666666693985462, "step": 330, "train_speed(iter/s)": 0.030751 }, { "clip_ratio": 0.0, "completion_length": 326.6, "epoch": 0.13535353535353536, "grad_norm": 1.8155099532753467, "kl": 0.0024575040792115034, "learning_rate": 2e-07, "loss": 0.0058914005756378176, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000670552253, "reward_std": 0.3784552842378616, "rewards/MultiModalAccuracyORM": 0.22500000670552253, "step": 335, "train_speed(iter/s)": 0.030852 }, { "clip_ratio": 0.0, "completion_length": 386.25, "epoch": 0.13737373737373737, "grad_norm": 0.8060655227590112, "kl": 0.0027357690036296845, "learning_rate": 2e-07, "loss": 0.015557366609573364, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2750000089406967, "reward_std": 0.2752989321947098, "rewards/MultiModalAccuracyORM": 0.2750000089406967, "step": 340, "train_speed(iter/s)": 0.030911 }, { "clip_ratio": 0.0, "completion_length": 399.25, "epoch": 0.1393939393939394, "grad_norm": 0.866681581232229, "kl": 0.0027076376718468964, "learning_rate": 2e-07, "loss": 0.04877374768257141, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.2750207006931305, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 345, "train_speed(iter/s)": 0.030975 }, { "clip_ratio": 0.0, "completion_length": 338.7, "epoch": 0.1414141414141414, "grad_norm": 1.2612207291126878, "kl": 0.0021965037449263036, "learning_rate": 2e-07, "loss": 0.01168801486492157, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667237877845, "reward_std": 0.33686081171035764, "rewards/MultiModalAccuracyORM": 0.24166667237877845, "step": 350, "train_speed(iter/s)": 0.030946 }, { "clip_ratio": 0.0, "completion_length": 223.15, "epoch": 0.14343434343434344, "grad_norm": 1.4058521539207838, "kl": 0.0031185435480438175, "learning_rate": 2e-07, "loss": 0.04595511555671692, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.0916666679084301, "reward_std": 0.19717080593109132, "rewards/MultiModalAccuracyORM": 0.0916666679084301, "step": 355, "train_speed(iter/s)": 0.031022 }, { "clip_ratio": 0.0, "completion_length": 374.05, "epoch": 0.14545454545454545, "grad_norm": 1.2063444833151329, "kl": 0.002893015928566456, "learning_rate": 2e-07, "loss": 0.05137801170349121, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.1666666716337204, "reward_std": 0.3794462442398071, "rewards/MultiModalAccuracyORM": 0.1666666716337204, "step": 360, "train_speed(iter/s)": 0.031042 }, { "clip_ratio": 0.0, "completion_length": 347.3, "epoch": 0.14747474747474748, "grad_norm": 0.004092609073173449, "kl": 0.002910976018756628, "learning_rate": 2e-07, "loss": -0.07540136575698853, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333343267441, "reward_std": 0.23933666944503784, "rewards/MultiModalAccuracyORM": 0.1833333343267441, "step": 365, "train_speed(iter/s)": 0.031092 }, { "clip_ratio": 0.0, "completion_length": 270.45, "epoch": 0.1494949494949495, "grad_norm": 1.9513061817753958, "kl": 0.002679864503443241, "learning_rate": 2e-07, "loss": -0.023768115043640136, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334252238275, "reward_std": 0.28758862614631653, "rewards/MultiModalAccuracyORM": 0.23333334252238275, "step": 370, "train_speed(iter/s)": 0.031131 }, { "clip_ratio": 0.0, "completion_length": 220.4, "epoch": 0.15151515151515152, "grad_norm": 1.418865700350339, "kl": 0.002804583264514804, "learning_rate": 2e-07, "loss": -0.020401501655578615, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2166666716337204, "reward_std": 0.3634364664554596, "rewards/MultiModalAccuracyORM": 0.2166666716337204, "step": 375, "train_speed(iter/s)": 0.031222 }, { "clip_ratio": 0.0, "completion_length": 221.95, "epoch": 0.15353535353535352, "grad_norm": 0.5418336734188505, "kl": 0.002608964138198644, "learning_rate": 2e-07, "loss": 4.297494888305664e-05, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000521540642, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.20000000521540642, "step": 380, "train_speed(iter/s)": 0.031331 }, { "clip_ratio": 0.0, "completion_length": 252.3, "epoch": 0.15555555555555556, "grad_norm": 1.2938746411839903, "kl": 0.003260041878093034, "learning_rate": 2e-07, "loss": 0.004776376485824585, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3000000104308128, "reward_std": 0.42925089299678804, "rewards/MultiModalAccuracyORM": 0.3000000104308128, "step": 385, "train_speed(iter/s)": 0.031397 }, { "clip_ratio": 0.0, "completion_length": 371.25, "epoch": 0.15757575757575756, "grad_norm": 0.5646363772035449, "kl": 0.003275243751704693, "learning_rate": 2e-07, "loss": -0.038579174876213075, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833334028720855, "reward_std": 0.27523933053016664, "rewards/MultiModalAccuracyORM": 0.15833334028720855, "step": 390, "train_speed(iter/s)": 0.031443 }, { "clip_ratio": 0.0, "completion_length": 445.15, "epoch": 0.1595959595959596, "grad_norm": 1.1555077391898336, "kl": 0.0027449760818853974, "learning_rate": 2e-07, "loss": 0.010327178239822387, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333333805203437, "reward_std": 0.4030417025089264, "rewards/MultiModalAccuracyORM": 0.28333333805203437, "step": 395, "train_speed(iter/s)": 0.031462 }, { "clip_ratio": 0.0, "completion_length": 257.0, "epoch": 0.16161616161616163, "grad_norm": 1.0521916035915964, "kl": 0.0038477353053167464, "learning_rate": 2e-07, "loss": -0.0054982278496026995, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.08333333432674409, "reward_std": 0.16830329298973085, "rewards/MultiModalAccuracyORM": 0.08333333432674409, "step": 400, "train_speed(iter/s)": 0.03148 }, { "clip_ratio": 0.0, "completion_length": 264.8, "epoch": 0.16363636363636364, "grad_norm": 1.407684002477615, "kl": 0.0036190941464155912, "learning_rate": 2e-07, "loss": -0.018236428499221802, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.40632360279560087, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 405, "train_speed(iter/s)": 0.031569 }, { "clip_ratio": 0.0, "completion_length": 242.35, "epoch": 0.16565656565656567, "grad_norm": 0.013270915639793456, "kl": 0.0037169228307902813, "learning_rate": 2e-07, "loss": 0.009006601572036744, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667237877845, "reward_std": 0.21775853037834167, "rewards/MultiModalAccuracyORM": 0.26666667237877845, "step": 410, "train_speed(iter/s)": 0.03167 }, { "clip_ratio": 0.0, "completion_length": 358.25, "epoch": 0.16767676767676767, "grad_norm": 0.018447250902146762, "kl": 0.0025411285692825913, "learning_rate": 2e-07, "loss": -0.013655924797058105, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.16696292161941528, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 415, "train_speed(iter/s)": 0.031744 }, { "clip_ratio": 0.0, "completion_length": 172.8, "epoch": 0.1696969696969697, "grad_norm": 0.9508473836871834, "kl": 0.004386395937763155, "learning_rate": 2e-07, "loss": 0.01687029004096985, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.2925831705331802, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 420, "train_speed(iter/s)": 0.031886 }, { "clip_ratio": 0.0, "completion_length": 386.25, "epoch": 0.1717171717171717, "grad_norm": 0.6441750873812842, "kl": 0.0029796794056892394, "learning_rate": 2e-07, "loss": 0.03889042139053345, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.31666667461395265, "reward_std": 0.32771685123443606, "rewards/MultiModalAccuracyORM": 0.31666667461395265, "step": 425, "train_speed(iter/s)": 0.03187 }, { "clip_ratio": 0.0, "completion_length": 304.05, "epoch": 0.17373737373737375, "grad_norm": 1.178565169897863, "kl": 0.004055350879207253, "learning_rate": 2e-07, "loss": 0.024070069193840027, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166666865348816, "reward_std": 0.2719598561525345, "rewards/MultiModalAccuracyORM": 0.19166666865348816, "step": 430, "train_speed(iter/s)": 0.03191 }, { "clip_ratio": 0.0, "completion_length": 316.65, "epoch": 0.17575757575757575, "grad_norm": 1.4718997092641302, "kl": 0.00319979356136173, "learning_rate": 2e-07, "loss": 0.0438249945640564, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2166666716337204, "reward_std": 0.26502286493778227, "rewards/MultiModalAccuracyORM": 0.2166666716337204, "step": 435, "train_speed(iter/s)": 0.031962 }, { "clip_ratio": 0.0, "completion_length": 374.25, "epoch": 0.17777777777777778, "grad_norm": 1.2499621790542323, "kl": 0.003690016525797546, "learning_rate": 2e-07, "loss": 0.02377350926399231, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333656191826, "reward_std": 0.3626674860715866, "rewards/MultiModalAccuracyORM": 0.23333333656191826, "step": 440, "train_speed(iter/s)": 0.032048 }, { "clip_ratio": 0.0, "completion_length": 308.65, "epoch": 0.1797979797979798, "grad_norm": 1.9383757905012418, "kl": 0.0035172241390682758, "learning_rate": 2e-07, "loss": -0.000668191909790039, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000111758709, "reward_std": 0.2651819050312042, "rewards/MultiModalAccuracyORM": 0.2500000111758709, "step": 445, "train_speed(iter/s)": 0.032084 }, { "clip_ratio": 0.0, "completion_length": 227.05, "epoch": 0.18181818181818182, "grad_norm": 1.3408808376367802, "kl": 0.004776520561426878, "learning_rate": 2e-07, "loss": 0.07315102815628052, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2250000074505806, "reward_std": 0.41088385283946993, "rewards/MultiModalAccuracyORM": 0.2250000074505806, "step": 450, "train_speed(iter/s)": 0.03213 }, { "clip_ratio": 0.0, "completion_length": 377.25, "epoch": 0.18383838383838383, "grad_norm": 0.7424759470710323, "kl": 0.002804637746885419, "learning_rate": 2e-07, "loss": -0.001922774314880371, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.18333333879709243, "reward_std": 0.32902405858039857, "rewards/MultiModalAccuracyORM": 0.18333333879709243, "step": 455, "train_speed(iter/s)": 0.03208 }, { "clip_ratio": 0.0, "completion_length": 310.6, "epoch": 0.18585858585858586, "grad_norm": 1.8334324625115481, "kl": 0.004147664201445878, "learning_rate": 2e-07, "loss": 0.021799880266189575, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.3004107713699341, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 460, "train_speed(iter/s)": 0.032139 }, { "clip_ratio": 0.0, "completion_length": 254.55, "epoch": 0.18787878787878787, "grad_norm": 1.041487897521913, "kl": 0.0034352200804278255, "learning_rate": 2e-07, "loss": -0.004481983184814453, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.2996539086103439, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 465, "train_speed(iter/s)": 0.032211 }, { "clip_ratio": 0.0, "completion_length": 253.45, "epoch": 0.1898989898989899, "grad_norm": 0.9482855673770706, "kl": 0.003838365920819342, "learning_rate": 2e-07, "loss": -0.013571098446846008, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.24640740752220153, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 470, "train_speed(iter/s)": 0.032291 }, { "clip_ratio": 0.0, "completion_length": 228.4, "epoch": 0.1919191919191919, "grad_norm": 1.3313266716504006, "kl": 0.004062736709602177, "learning_rate": 2e-07, "loss": 0.052185094356536864, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.36666667833924294, "reward_std": 0.350342208147049, "rewards/MultiModalAccuracyORM": 0.36666667833924294, "step": 475, "train_speed(iter/s)": 0.032317 }, { "clip_ratio": 0.0, "completion_length": 378.75, "epoch": 0.19393939393939394, "grad_norm": 0.006691860039934013, "kl": 0.003972473449539393, "learning_rate": 2e-07, "loss": 0.013308031857013703, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.2652770906686783, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 480, "train_speed(iter/s)": 0.032297 }, { "clip_ratio": 0.0, "completion_length": 279.95, "epoch": 0.19595959595959597, "grad_norm": 1.858828735648231, "kl": 0.0045673437649384144, "learning_rate": 2e-07, "loss": -0.08343450427055359, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333334252238274, "reward_std": 0.30661733746528624, "rewards/MultiModalAccuracyORM": 0.28333334252238274, "step": 485, "train_speed(iter/s)": 0.03234 }, { "clip_ratio": 0.0, "completion_length": 270.2, "epoch": 0.19797979797979798, "grad_norm": 0.9385618148678871, "kl": 0.0037567693390883504, "learning_rate": 2e-07, "loss": -0.016573160886764526, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.21394325494766236, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 490, "train_speed(iter/s)": 0.032454 }, { "clip_ratio": 0.0, "completion_length": 266.6, "epoch": 0.2, "grad_norm": 0.8533415655351878, "kl": 0.003521406790241599, "learning_rate": 2e-07, "loss": -0.019848501682281493, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.3541334718465805, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 495, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.20202020202020202, "grad_norm": 10.76251214333233, "learning_rate": 2e-07, "loss": 0.010219329595565796, "memory(GiB)": 67.41, "step": 500, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.20202020202020202, "eval_clip_ratio": 0.0, "eval_completion_length": 343.88834259033206, "eval_kl": 0.0037221815134398637, "eval_loss": 0.033297207206487656, "eval_response_clip_ratio": 0.013333333432674408, "eval_reward": 0.2283333396911621, "eval_reward_std": 0.3360080027580261, "eval_rewards/MultiModalAccuracyORM": 0.2283333396911621, "eval_runtime": 614.5158, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.008, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 296.325, "epoch": 0.20404040404040405, "grad_norm": 1.0491762446720787, "kl": 0.0045048539526760575, "learning_rate": 2e-07, "loss": -0.03126291036605835, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2125000037252903, "reward_std": 0.29145624935626985, "rewards/MultiModalAccuracyORM": 0.2125000037252903, "step": 505, "train_speed(iter/s)": 0.030627 }, { "clip_ratio": 0.0, "completion_length": 432.65, "epoch": 0.20606060606060606, "grad_norm": 1.2154663404881314, "kl": 0.005191830382682383, "learning_rate": 2e-07, "loss": 0.009676572680473328, "memory(GiB)": 67.41, "response_clip_ratio": 0.1, "reward": 0.2416666753590107, "reward_std": 0.26368249356746676, "rewards/MultiModalAccuracyORM": 0.2416666753590107, "step": 510, "train_speed(iter/s)": 0.030645 }, { "clip_ratio": 0.0, "completion_length": 322.7, "epoch": 0.2080808080808081, "grad_norm": 0.5041268253271711, "kl": 0.003948929556645453, "learning_rate": 2e-07, "loss": -0.03559762239456177, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1416666679084301, "reward_std": 0.2159808874130249, "rewards/MultiModalAccuracyORM": 0.1416666679084301, "step": 515, "train_speed(iter/s)": 0.030611 }, { "clip_ratio": 0.0, "completion_length": 360.1, "epoch": 0.2101010101010101, "grad_norm": 0.29702395537873283, "kl": 0.00434970180504024, "learning_rate": 2e-07, "loss": -0.09598699808120728, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000059604645, "reward_std": 0.3237069517374039, "rewards/MultiModalAccuracyORM": 0.2750000059604645, "step": 520, "train_speed(iter/s)": 0.030675 }, { "clip_ratio": 0.0, "completion_length": 285.9, "epoch": 0.21212121212121213, "grad_norm": 1.4484696850763847, "kl": 0.0041591078508645294, "learning_rate": 2e-07, "loss": -0.06923834681510925, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333358168602, "reward_std": 0.3533048897981644, "rewards/MultiModalAccuracyORM": 0.2583333358168602, "step": 525, "train_speed(iter/s)": 0.030707 }, { "clip_ratio": 0.0, "completion_length": 270.0, "epoch": 0.21414141414141413, "grad_norm": 1.2132868650103246, "kl": 0.0032755408203229306, "learning_rate": 2e-07, "loss": -0.012829649448394775, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667237877846, "reward_std": 0.24261614382267, "rewards/MultiModalAccuracyORM": 0.21666667237877846, "step": 530, "train_speed(iter/s)": 0.030765 }, { "clip_ratio": 0.0, "completion_length": 250.15, "epoch": 0.21616161616161617, "grad_norm": 1.3471895483550291, "kl": 0.004648885619826615, "learning_rate": 2e-07, "loss": -0.015677666664123534, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833334028720856, "reward_std": 0.3315081149339676, "rewards/MultiModalAccuracyORM": 0.25833334028720856, "step": 535, "train_speed(iter/s)": 0.030826 }, { "clip_ratio": 0.0, "completion_length": 270.25, "epoch": 0.21818181818181817, "grad_norm": 0.6603883596764876, "kl": 0.003572591207921505, "learning_rate": 2e-07, "loss": 0.0794254183769226, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000298023225, "reward_std": 0.17775078415870665, "rewards/MultiModalAccuracyORM": 0.27500000298023225, "step": 540, "train_speed(iter/s)": 0.030827 }, { "clip_ratio": 0.0, "completion_length": 217.15, "epoch": 0.2202020202020202, "grad_norm": 1.1298566902251597, "kl": 0.0045941169140860435, "learning_rate": 2e-07, "loss": 0.021821698546409606, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3833333432674408, "reward_std": 0.3860022217035294, "rewards/MultiModalAccuracyORM": 0.3833333432674408, "step": 545, "train_speed(iter/s)": 0.03093 }, { "clip_ratio": 0.0, "completion_length": 251.6, "epoch": 0.2222222222222222, "grad_norm": 1.9661316551950794, "kl": 0.004029618808999658, "learning_rate": 2e-07, "loss": -0.09334349632263184, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333395421505, "reward_std": 0.30636311769485475, "rewards/MultiModalAccuracyORM": 0.1583333395421505, "step": 550, "train_speed(iter/s)": 0.030967 }, { "clip_ratio": 0.0, "completion_length": 336.95, "epoch": 0.22424242424242424, "grad_norm": 1.551239261425344, "kl": 0.0035113503108732402, "learning_rate": 2e-07, "loss": -0.020345258712768554, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1416666701436043, "reward_std": 0.30789810717105864, "rewards/MultiModalAccuracyORM": 0.1416666701436043, "step": 555, "train_speed(iter/s)": 0.030967 }, { "clip_ratio": 0.0, "completion_length": 267.7, "epoch": 0.22626262626262628, "grad_norm": 0.6662733706778937, "kl": 0.003866145922802389, "learning_rate": 2e-07, "loss": 0.119044029712677, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333358168602, "reward_std": 0.24265173375606536, "rewards/MultiModalAccuracyORM": 0.2083333358168602, "step": 560, "train_speed(iter/s)": 0.031003 }, { "clip_ratio": 0.0, "completion_length": 308.85, "epoch": 0.22828282828282828, "grad_norm": 1.107656830931071, "kl": 0.0037969154422171415, "learning_rate": 2e-07, "loss": -0.007524615526199341, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.33081327974796293, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 565, "train_speed(iter/s)": 0.03102 }, { "clip_ratio": 0.0, "completion_length": 169.25, "epoch": 0.23030303030303031, "grad_norm": 1.7816660571542655, "kl": 0.00519141077529639, "learning_rate": 2e-07, "loss": 0.04047863185405731, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333373069763, "reward_std": 0.31963745057582854, "rewards/MultiModalAccuracyORM": 0.2333333373069763, "step": 570, "train_speed(iter/s)": 0.031108 }, { "clip_ratio": 0.0, "completion_length": 246.75, "epoch": 0.23232323232323232, "grad_norm": 1.2214979642804986, "kl": 0.004031882807612419, "learning_rate": 2e-07, "loss": 0.020588791370391844, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.38450281620025634, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 575, "train_speed(iter/s)": 0.031083 }, { "clip_ratio": 0.0, "completion_length": 360.75, "epoch": 0.23434343434343435, "grad_norm": 1.5078231957808115, "kl": 0.0030958396266214548, "learning_rate": 2e-07, "loss": 0.04003850221633911, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.266666679084301, "reward_std": 0.3370438635349274, "rewards/MultiModalAccuracyORM": 0.266666679084301, "step": 580, "train_speed(iter/s)": 0.031079 }, { "clip_ratio": 0.0, "completion_length": 246.9, "epoch": 0.23636363636363636, "grad_norm": 1.1609668181534438, "kl": 0.003774796542711556, "learning_rate": 2e-07, "loss": 0.03895624876022339, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1250000014901161, "reward_std": 0.2518449932336807, "rewards/MultiModalAccuracyORM": 0.1250000014901161, "step": 585, "train_speed(iter/s)": 0.03114 }, { "clip_ratio": 0.0, "completion_length": 322.75, "epoch": 0.2383838383838384, "grad_norm": 1.3664975542154603, "kl": 0.0038781519746407867, "learning_rate": 2e-07, "loss": -0.012830546498298645, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000014901161, "reward_std": 0.20442162454128265, "rewards/MultiModalAccuracyORM": 0.2750000014901161, "step": 590, "train_speed(iter/s)": 0.031195 }, { "clip_ratio": 0.0, "completion_length": 343.25, "epoch": 0.2404040404040404, "grad_norm": 1.0360518178054594, "kl": 0.004115447495132684, "learning_rate": 2e-07, "loss": 0.042417135834693906, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2333333395421505, "reward_std": 0.3581433713436127, "rewards/MultiModalAccuracyORM": 0.2333333395421505, "step": 595, "train_speed(iter/s)": 0.03125 }, { "clip_ratio": 0.0, "completion_length": 310.2, "epoch": 0.24242424242424243, "grad_norm": 1.3913775959095787, "kl": 0.0037250344757921994, "learning_rate": 2e-07, "loss": 0.00183790922164917, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1416666701436043, "reward_std": 0.2775311887264252, "rewards/MultiModalAccuracyORM": 0.1416666701436043, "step": 600, "train_speed(iter/s)": 0.031261 }, { "clip_ratio": 0.0, "completion_length": 255.55, "epoch": 0.24444444444444444, "grad_norm": 0.440329365041974, "kl": 0.004550268652383238, "learning_rate": 2e-07, "loss": 0.005285969376564026, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333387970924, "reward_std": 0.3297544836997986, "rewards/MultiModalAccuracyORM": 0.2583333387970924, "step": 605, "train_speed(iter/s)": 0.031263 }, { "clip_ratio": 0.0, "completion_length": 280.9, "epoch": 0.24646464646464647, "grad_norm": 1.2624826502631048, "kl": 0.005133295292034745, "learning_rate": 2e-07, "loss": -0.055149185657501223, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833333730697633, "reward_std": 0.24640740752220153, "rewards/MultiModalAccuracyORM": 0.25833333730697633, "step": 610, "train_speed(iter/s)": 0.031347 }, { "clip_ratio": 0.0, "completion_length": 354.2, "epoch": 0.24848484848484848, "grad_norm": 0.012452326444885307, "kl": 0.003399366606026888, "learning_rate": 2e-07, "loss": 0.029164138436317443, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1416666716337204, "reward_std": 0.19744904339313507, "rewards/MultiModalAccuracyORM": 0.1416666716337204, "step": 615, "train_speed(iter/s)": 0.031388 }, { "clip_ratio": 0.0, "completion_length": 272.55, "epoch": 0.2505050505050505, "grad_norm": 1.3165129641085085, "kl": 0.004156474373303354, "learning_rate": 2e-07, "loss": 0.0376417338848114, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25000000894069674, "reward_std": 0.30035116970539094, "rewards/MultiModalAccuracyORM": 0.25000000894069674, "step": 620, "train_speed(iter/s)": 0.031401 }, { "clip_ratio": 0.0, "completion_length": 317.85, "epoch": 0.25252525252525254, "grad_norm": 1.4029841920807011, "kl": 0.003737919870764017, "learning_rate": 2e-07, "loss": 0.006714335083961487, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333358168602, "reward_std": 0.28227151930332184, "rewards/MultiModalAccuracyORM": 0.1583333358168602, "step": 625, "train_speed(iter/s)": 0.031402 }, { "clip_ratio": 0.0, "completion_length": 210.7, "epoch": 0.2545454545454545, "grad_norm": 1.9019291244494156, "kl": 0.005253740306943655, "learning_rate": 2e-07, "loss": 0.013242076337337493, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166667088866235, "reward_std": 0.286027193069458, "rewards/MultiModalAccuracyORM": 0.19166667088866235, "step": 630, "train_speed(iter/s)": 0.031476 }, { "clip_ratio": 0.0, "completion_length": 343.75, "epoch": 0.25656565656565655, "grad_norm": 1.5813011213273676, "kl": 0.003963836142793298, "learning_rate": 2e-07, "loss": 0.03190605342388153, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.4000905990600586, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 635, "train_speed(iter/s)": 0.031501 }, { "clip_ratio": 0.0, "completion_length": 325.15, "epoch": 0.2585858585858586, "grad_norm": 0.9990236313380018, "kl": 0.004905425664037466, "learning_rate": 2e-07, "loss": 0.01101228892803192, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.16666667088866233, "reward_std": 0.3167103588581085, "rewards/MultiModalAccuracyORM": 0.16666667088866233, "step": 640, "train_speed(iter/s)": 0.031543 }, { "clip_ratio": 0.0, "completion_length": 280.3, "epoch": 0.2606060606060606, "grad_norm": 0.8236059940973481, "kl": 0.004417796130292117, "learning_rate": 2e-07, "loss": 0.021716611087322236, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.11666666865348815, "reward_std": 0.2551600575447083, "rewards/MultiModalAccuracyORM": 0.11666666865348815, "step": 645, "train_speed(iter/s)": 0.031613 }, { "clip_ratio": 0.0, "completion_length": 368.3, "epoch": 0.26262626262626265, "grad_norm": 1.208141693935316, "kl": 0.0037754237418994308, "learning_rate": 2e-07, "loss": 0.05538809299468994, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166667088866235, "reward_std": 0.35232023894786835, "rewards/MultiModalAccuracyORM": 0.19166667088866235, "step": 650, "train_speed(iter/s)": 0.031638 }, { "clip_ratio": 0.0, "completion_length": 318.6, "epoch": 0.26464646464646463, "grad_norm": 0.9531663350118769, "kl": 0.0037441954482346773, "learning_rate": 2e-07, "loss": -0.0026717036962509155, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.29383077621459963, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 655, "train_speed(iter/s)": 0.031701 }, { "clip_ratio": 0.0, "completion_length": 406.25, "epoch": 0.26666666666666666, "grad_norm": 0.45156032406611, "kl": 0.0036298127146437765, "learning_rate": 2e-07, "loss": 0.004486371576786041, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666693985462, "reward_std": 0.26371566355228426, "rewards/MultiModalAccuracyORM": 0.1666666693985462, "step": 660, "train_speed(iter/s)": 0.031747 }, { "clip_ratio": 0.0, "completion_length": 278.6, "epoch": 0.2686868686868687, "grad_norm": 1.2354602142887612, "kl": 0.005091256252489984, "learning_rate": 2e-07, "loss": -0.01994211971759796, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666679084301, "reward_std": 0.24239750802516938, "rewards/MultiModalAccuracyORM": 0.1666666679084301, "step": 665, "train_speed(iter/s)": 0.031774 }, { "clip_ratio": 0.0, "completion_length": 297.65, "epoch": 0.27070707070707073, "grad_norm": 1.5995488899211916, "kl": 0.004296229011379183, "learning_rate": 2e-07, "loss": -0.02723192870616913, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3583333417773247, "reward_std": 0.32669364511966703, "rewards/MultiModalAccuracyORM": 0.3583333417773247, "step": 670, "train_speed(iter/s)": 0.031837 }, { "clip_ratio": 0.0, "completion_length": 391.65, "epoch": 0.2727272727272727, "grad_norm": 0.8793351325351834, "kl": 0.003925298724789172, "learning_rate": 2e-07, "loss": 0.01873851418495178, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000819563865, "reward_std": 0.29159853160381316, "rewards/MultiModalAccuracyORM": 0.22500000819563865, "step": 675, "train_speed(iter/s)": 0.031854 }, { "clip_ratio": 0.0, "completion_length": 282.9, "epoch": 0.27474747474747474, "grad_norm": 1.2196841930405988, "kl": 0.0043221796862781044, "learning_rate": 2e-07, "loss": -0.0018929451704025268, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.36864383816719054, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 680, "train_speed(iter/s)": 0.031875 }, { "clip_ratio": 0.0, "completion_length": 491.4, "epoch": 0.2767676767676768, "grad_norm": 0.578110919194848, "kl": 0.003840261767618358, "learning_rate": 2e-07, "loss": 0.001986941695213318, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.14166666865348815, "reward_std": 0.2719598561525345, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 685, "train_speed(iter/s)": 0.031873 }, { "clip_ratio": 0.0, "completion_length": 250.5, "epoch": 0.2787878787878788, "grad_norm": 1.7045589714757738, "kl": 0.004626664402894676, "learning_rate": 2e-07, "loss": 0.012319982051849365, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1916666679084301, "reward_std": 0.33984750509262085, "rewards/MultiModalAccuracyORM": 0.1916666679084301, "step": 690, "train_speed(iter/s)": 0.031928 }, { "clip_ratio": 0.0, "completion_length": 321.9, "epoch": 0.2808080808080808, "grad_norm": 1.3591693418225999, "kl": 0.004481176193803549, "learning_rate": 2e-07, "loss": -0.01004476472735405, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666716337204, "reward_std": 0.24560284316539766, "rewards/MultiModalAccuracyORM": 0.1666666716337204, "step": 695, "train_speed(iter/s)": 0.031967 }, { "clip_ratio": 0.0, "completion_length": 222.35, "epoch": 0.2828282828282828, "grad_norm": 0.8510617258462263, "kl": 0.00467616633977741, "learning_rate": 2e-07, "loss": 0.029875683784484863, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166667684912684, "reward_std": 0.3390218883752823, "rewards/MultiModalAccuracyORM": 0.29166667684912684, "step": 700, "train_speed(iter/s)": 0.031964 }, { "clip_ratio": 0.0, "completion_length": 264.4, "epoch": 0.28484848484848485, "grad_norm": 1.4378845483189635, "kl": 0.005385997367557138, "learning_rate": 2e-07, "loss": 0.015697968006134034, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.44166668206453324, "reward_std": 0.39862974882125857, "rewards/MultiModalAccuracyORM": 0.44166668206453324, "step": 705, "train_speed(iter/s)": 0.032034 }, { "clip_ratio": 0.0, "completion_length": 223.8, "epoch": 0.2868686868686869, "grad_norm": 1.258370491031708, "kl": 0.0045306324027478695, "learning_rate": 2e-07, "loss": -0.013608846068382262, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667610406876, "reward_std": 0.23224488496780396, "rewards/MultiModalAccuracyORM": 0.21666667610406876, "step": 710, "train_speed(iter/s)": 0.032068 }, { "clip_ratio": 0.0, "completion_length": 308.55, "epoch": 0.28888888888888886, "grad_norm": 2.4400677744024937, "kl": 0.005214189388789236, "learning_rate": 2e-07, "loss": -0.01957079768180847, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.3274982154369354, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 715, "train_speed(iter/s)": 0.032114 }, { "clip_ratio": 0.0, "completion_length": 379.45, "epoch": 0.2909090909090909, "grad_norm": 1.5673843849964653, "kl": 0.004570033040363341, "learning_rate": 2e-07, "loss": -0.018536585569381713, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.28472240567207335, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 720, "train_speed(iter/s)": 0.03214 }, { "clip_ratio": 0.0, "completion_length": 304.6, "epoch": 0.29292929292929293, "grad_norm": 0.9935300145018874, "kl": 0.0055370709858834745, "learning_rate": 2e-07, "loss": 0.03206640779972077, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000447034835, "reward_std": 0.3719944924116135, "rewards/MultiModalAccuracyORM": 0.20000000447034835, "step": 725, "train_speed(iter/s)": 0.032199 }, { "clip_ratio": 0.0, "completion_length": 325.3, "epoch": 0.29494949494949496, "grad_norm": 1.7757797332994796, "kl": 0.005048377229832113, "learning_rate": 2e-07, "loss": -0.023146471381187438, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.277725812792778, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 730, "train_speed(iter/s)": 0.032226 }, { "clip_ratio": 0.0, "completion_length": 422.3, "epoch": 0.296969696969697, "grad_norm": 0.47809660757769357, "kl": 0.006107103615067899, "learning_rate": 2e-07, "loss": -0.03393080234527588, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.1250000037252903, "reward_std": 0.21374863088130952, "rewards/MultiModalAccuracyORM": 0.1250000037252903, "step": 735, "train_speed(iter/s)": 0.032213 }, { "clip_ratio": 0.0, "completion_length": 285.9, "epoch": 0.298989898989899, "grad_norm": 1.0436844507348098, "kl": 0.004702468903269618, "learning_rate": 2e-07, "loss": -0.021845155954360963, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000670552253, "reward_std": 0.32425729632377626, "rewards/MultiModalAccuracyORM": 0.22500000670552253, "step": 740, "train_speed(iter/s)": 0.032209 }, { "clip_ratio": 0.0, "completion_length": 524.05, "epoch": 0.301010101010101, "grad_norm": 1.049067442546249, "kl": 0.0037612170912325383, "learning_rate": 2e-07, "loss": 0.021030843257904053, "memory(GiB)": 67.41, "response_clip_ratio": 0.1, "reward": 0.15833333656191825, "reward_std": 0.3127004593610764, "rewards/MultiModalAccuracyORM": 0.15833333656191825, "step": 745, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.30303030303030304, "grad_norm": 0.9001583844372758, "learning_rate": 2e-07, "loss": -0.02579028606414795, "memory(GiB)": 67.41, "step": 750, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.30303030303030304, "eval_clip_ratio": 0.0, "eval_completion_length": 340.4366758728027, "eval_kl": 0.004551883968524635, "eval_loss": 0.0018110970268025994, "eval_response_clip_ratio": 0.015000000149011612, "eval_reward": 0.20166667073965072, "eval_reward_std": 0.2683356386423111, "eval_rewards/MultiModalAccuracyORM": 0.20166667073965072, "eval_runtime": 643.2616, "eval_samples_per_second": 0.078, "eval_steps_per_second": 0.008, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 334.525, "epoch": 0.30505050505050507, "grad_norm": 1.176274790320748, "kl": 0.004291673714760691, "learning_rate": 2e-07, "loss": -0.014118121564388275, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1625000022351742, "reward_std": 0.2608040913939476, "rewards/MultiModalAccuracyORM": 0.1625000022351742, "step": 755, "train_speed(iter/s)": 0.030899 }, { "clip_ratio": 0.0, "completion_length": 435.5, "epoch": 0.30707070707070705, "grad_norm": 1.6795792962236686, "kl": 0.0048645576927810906, "learning_rate": 2e-07, "loss": 0.0353985846042633, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.14166667088866233, "reward_std": 0.27148365676403047, "rewards/MultiModalAccuracyORM": 0.14166667088866233, "step": 760, "train_speed(iter/s)": 0.030904 }, { "clip_ratio": 0.0, "completion_length": 242.6, "epoch": 0.3090909090909091, "grad_norm": 1.5837851108447996, "kl": 0.005048908712342382, "learning_rate": 2e-07, "loss": 0.0012214839458465575, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.325000012665987, "reward_std": 0.3860262334346771, "rewards/MultiModalAccuracyORM": 0.325000012665987, "step": 765, "train_speed(iter/s)": 0.030943 }, { "clip_ratio": 0.0, "completion_length": 473.85, "epoch": 0.3111111111111111, "grad_norm": 1.1108515543557964, "kl": 0.19770307638682424, "learning_rate": 2e-07, "loss": 0.033725738525390625, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2250000022351742, "reward_std": 0.35085399746894835, "rewards/MultiModalAccuracyORM": 0.2250000022351742, "step": 770, "train_speed(iter/s)": 0.030941 }, { "clip_ratio": 0.0, "completion_length": 244.05, "epoch": 0.31313131313131315, "grad_norm": 1.1567261301470195, "kl": 0.005782892415300012, "learning_rate": 2e-07, "loss": 0.011248695850372314, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2166666716337204, "reward_std": 0.344626384973526, "rewards/MultiModalAccuracyORM": 0.2166666716337204, "step": 775, "train_speed(iter/s)": 0.031007 }, { "clip_ratio": 0.0, "completion_length": 363.6, "epoch": 0.3151515151515151, "grad_norm": 0.8996969813314127, "kl": 0.004383829329162836, "learning_rate": 2e-07, "loss": 0.032080155611038205, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.08333333507180214, "reward_std": 0.18106584250926971, "rewards/MultiModalAccuracyORM": 0.08333333507180214, "step": 780, "train_speed(iter/s)": 0.03104 }, { "clip_ratio": 0.0, "completion_length": 335.55, "epoch": 0.31717171717171716, "grad_norm": 0.8366413858450646, "kl": 0.00416933981468901, "learning_rate": 2e-07, "loss": 0.01989735960960388, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000894069672, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.20000000894069672, "step": 785, "train_speed(iter/s)": 0.031043 }, { "clip_ratio": 0.0, "completion_length": 352.75, "epoch": 0.3191919191919192, "grad_norm": 1.2338416774729999, "kl": 0.0057875648839399215, "learning_rate": 2e-07, "loss": 0.04425770938396454, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.2900991141796112, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 790, "train_speed(iter/s)": 0.031075 }, { "clip_ratio": 0.0, "completion_length": 396.55, "epoch": 0.3212121212121212, "grad_norm": 0.6085363702164241, "kl": 0.005086203385144472, "learning_rate": 2e-07, "loss": 0.07262378931045532, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166666939854624, "reward_std": 0.39939576387405396, "rewards/MultiModalAccuracyORM": 0.29166666939854624, "step": 795, "train_speed(iter/s)": 0.031082 }, { "clip_ratio": 0.0, "completion_length": 330.05, "epoch": 0.32323232323232326, "grad_norm": 1.710902967582431, "kl": 0.0056509776040911674, "learning_rate": 2e-07, "loss": 0.05898982286453247, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666805744171, "reward_std": 0.40188278555870055, "rewards/MultiModalAccuracyORM": 0.3666666805744171, "step": 800, "train_speed(iter/s)": 0.031108 }, { "clip_ratio": 0.0, "completion_length": 310.25, "epoch": 0.32525252525252524, "grad_norm": 1.4124668864516894, "kl": 0.004830094543285668, "learning_rate": 2e-07, "loss": 0.04988533854484558, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000894069672, "reward_std": 0.3737125337123871, "rewards/MultiModalAccuracyORM": 0.22500000894069672, "step": 805, "train_speed(iter/s)": 0.031118 }, { "clip_ratio": 0.0, "completion_length": 254.2, "epoch": 0.32727272727272727, "grad_norm": 0.8398251404363962, "kl": 0.005614466220140457, "learning_rate": 2e-07, "loss": -0.02921849489212036, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333358168602, "reward_std": 0.3322981417179108, "rewards/MultiModalAccuracyORM": 0.2333333358168602, "step": 810, "train_speed(iter/s)": 0.031188 }, { "clip_ratio": 0.0, "completion_length": 291.85, "epoch": 0.3292929292929293, "grad_norm": 1.4062519243001712, "kl": 0.005611171037890017, "learning_rate": 2e-07, "loss": 0.027082645893096925, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.37593025267124175, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 815, "train_speed(iter/s)": 0.031214 }, { "clip_ratio": 0.0, "completion_length": 244.55, "epoch": 0.33131313131313134, "grad_norm": 1.535732791217238, "kl": 0.006293811020441353, "learning_rate": 2e-07, "loss": -0.004323112964630127, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166667014360428, "reward_std": 0.36012140214443206, "rewards/MultiModalAccuracyORM": 0.19166667014360428, "step": 820, "train_speed(iter/s)": 0.031248 }, { "clip_ratio": 0.0, "completion_length": 263.3, "epoch": 0.3333333333333333, "grad_norm": 1.4122524484024275, "kl": 0.005569443246349693, "learning_rate": 2e-07, "loss": 0.03137176036834717, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15000000223517418, "reward_std": 0.27221408784389495, "rewards/MultiModalAccuracyORM": 0.15000000223517418, "step": 825, "train_speed(iter/s)": 0.031262 }, { "clip_ratio": 0.0, "completion_length": 248.3, "epoch": 0.33535353535353535, "grad_norm": 0.847697597752372, "kl": 0.006561408983543515, "learning_rate": 2e-07, "loss": 0.001510709524154663, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.3202118068933487, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 830, "train_speed(iter/s)": 0.031276 }, { "clip_ratio": 0.0, "completion_length": 284.1, "epoch": 0.3373737373737374, "grad_norm": 1.0077148325129925, "kl": 0.0051434833323583005, "learning_rate": 2e-07, "loss": -0.046033868193626405, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000819563865, "reward_std": 0.3932794779539108, "rewards/MultiModalAccuracyORM": 0.22500000819563865, "step": 835, "train_speed(iter/s)": 0.031306 }, { "clip_ratio": 0.0, "completion_length": 325.4, "epoch": 0.3393939393939394, "grad_norm": 1.0924856088353982, "kl": 0.006390028609894216, "learning_rate": 2e-07, "loss": 0.017022347450256346, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666775941849, "reward_std": 0.34735551476478577, "rewards/MultiModalAccuracyORM": 0.2666666775941849, "step": 840, "train_speed(iter/s)": 0.031335 }, { "clip_ratio": 0.0, "completion_length": 288.1, "epoch": 0.3414141414141414, "grad_norm": 1.2514996858658436, "kl": 0.0053161653922870755, "learning_rate": 2e-07, "loss": 0.017437267303466796, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25000000819563867, "reward_std": 0.2893422573804855, "rewards/MultiModalAccuracyORM": 0.25000000819563867, "step": 845, "train_speed(iter/s)": 0.031345 }, { "clip_ratio": 0.0, "completion_length": 330.0, "epoch": 0.3434343434343434, "grad_norm": 1.0839879351711734, "kl": 0.005381654878146946, "learning_rate": 2e-07, "loss": 0.013951669633388519, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.29003951251506804, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 850, "train_speed(iter/s)": 0.031349 }, { "clip_ratio": 0.0, "completion_length": 324.4, "epoch": 0.34545454545454546, "grad_norm": 1.5570693343632969, "kl": 0.008254527021199465, "learning_rate": 2e-07, "loss": -0.0007428258657455444, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166667759418485, "reward_std": 0.3415919840335846, "rewards/MultiModalAccuracyORM": 0.29166667759418485, "step": 855, "train_speed(iter/s)": 0.031395 }, { "clip_ratio": 0.0, "completion_length": 190.0, "epoch": 0.3474747474747475, "grad_norm": 1.545061824643743, "kl": 0.006796046695671976, "learning_rate": 2e-07, "loss": 0.01094520315527916, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.38901292681694033, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 860, "train_speed(iter/s)": 0.031457 }, { "clip_ratio": 0.0, "completion_length": 302.3, "epoch": 0.34949494949494947, "grad_norm": 1.868618439314485, "kl": 0.00691440338268876, "learning_rate": 2e-07, "loss": -0.029064083099365236, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.3083333410322666, "reward_std": 0.4548985332250595, "rewards/MultiModalAccuracyORM": 0.3083333410322666, "step": 865, "train_speed(iter/s)": 0.031484 }, { "clip_ratio": 0.0, "completion_length": 322.65, "epoch": 0.3515151515151515, "grad_norm": 1.3127307464437807, "kl": 0.005565014760941267, "learning_rate": 2e-07, "loss": 0.019167789816856386, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.26816859245300295, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 870, "train_speed(iter/s)": 0.031519 }, { "clip_ratio": 0.0, "completion_length": 300.05, "epoch": 0.35353535353535354, "grad_norm": 1.5008568373381221, "kl": 0.005994554329663515, "learning_rate": 2e-07, "loss": 0.023988738656044006, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000059604645, "reward_std": 0.38748950958251954, "rewards/MultiModalAccuracyORM": 0.2750000059604645, "step": 875, "train_speed(iter/s)": 0.031537 }, { "clip_ratio": 0.0, "completion_length": 318.35, "epoch": 0.35555555555555557, "grad_norm": 0.8858552851817257, "kl": 0.0050561846233904365, "learning_rate": 2e-07, "loss": 0.001196683943271637, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.27500000819563863, "reward_std": 0.28755303025245665, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 880, "train_speed(iter/s)": 0.031564 }, { "clip_ratio": 0.0, "completion_length": 213.9, "epoch": 0.3575757575757576, "grad_norm": 0.5620687272407863, "kl": 0.006028561620041728, "learning_rate": 2e-07, "loss": -0.013837304711341859, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000014901161, "reward_std": 0.25664491951465607, "rewards/MultiModalAccuracyORM": 0.2750000014901161, "step": 885, "train_speed(iter/s)": 0.031616 }, { "clip_ratio": 0.0, "completion_length": 247.8, "epoch": 0.3595959595959596, "grad_norm": 2.985697887769574, "kl": 0.007074238453060389, "learning_rate": 2e-07, "loss": 0.019273641705513, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333879709245, "reward_std": 0.2681685984134674, "rewards/MultiModalAccuracyORM": 0.23333333879709245, "step": 890, "train_speed(iter/s)": 0.031645 }, { "clip_ratio": 0.0, "completion_length": 549.55, "epoch": 0.3616161616161616, "grad_norm": 1.4558424844518882, "kl": 0.003922113939188421, "learning_rate": 2e-07, "loss": 0.06525606513023377, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2500000074505806, "reward_std": 0.3988839745521545, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 895, "train_speed(iter/s)": 0.031624 }, { "clip_ratio": 0.0, "completion_length": 347.4, "epoch": 0.36363636363636365, "grad_norm": 0.656120438814147, "kl": 0.006284803117159754, "learning_rate": 2e-07, "loss": -0.0007577657699584961, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000521540642, "reward_std": 0.25811116099357606, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 900, "train_speed(iter/s)": 0.031648 }, { "clip_ratio": 0.0, "completion_length": 299.0, "epoch": 0.3656565656565657, "grad_norm": 1.5386640162242566, "kl": 0.006484637362882495, "learning_rate": 2e-07, "loss": 0.03939504027366638, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667312383652, "reward_std": 0.2964008718729019, "rewards/MultiModalAccuracyORM": 0.24166667312383652, "step": 905, "train_speed(iter/s)": 0.031684 }, { "clip_ratio": 0.0, "completion_length": 329.45, "epoch": 0.36767676767676766, "grad_norm": 1.636331464172546, "kl": 0.006949460273608566, "learning_rate": 2e-07, "loss": -0.07270481586456298, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666738688946, "reward_std": 0.4011138051748276, "rewards/MultiModalAccuracyORM": 0.3166666738688946, "step": 910, "train_speed(iter/s)": 0.031729 }, { "clip_ratio": 0.0, "completion_length": 261.65, "epoch": 0.3696969696969697, "grad_norm": 0.6925500168851378, "kl": 0.006146807945333422, "learning_rate": 2e-07, "loss": 0.0035164892673492433, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334773778917, "reward_std": 0.32052563428878783, "rewards/MultiModalAccuracyORM": 0.33333334773778917, "step": 915, "train_speed(iter/s)": 0.03175 }, { "clip_ratio": 0.0, "completion_length": 280.1, "epoch": 0.3717171717171717, "grad_norm": 1.8970854173810114, "kl": 0.005729123065248132, "learning_rate": 2e-07, "loss": 0.05737735033035278, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35000001043081286, "reward_std": 0.4226803660392761, "rewards/MultiModalAccuracyORM": 0.35000001043081286, "step": 920, "train_speed(iter/s)": 0.031768 }, { "clip_ratio": 0.0, "completion_length": 269.85, "epoch": 0.37373737373737376, "grad_norm": 1.1898661364371217, "kl": 0.0061120831873267886, "learning_rate": 2e-07, "loss": 0.011839108169078827, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333410322666, "reward_std": 0.3144781023263931, "rewards/MultiModalAccuracyORM": 0.2333333410322666, "step": 925, "train_speed(iter/s)": 0.031823 }, { "clip_ratio": 0.0, "completion_length": 277.4, "epoch": 0.37575757575757573, "grad_norm": 1.3632550964844283, "kl": 0.006334329699166119, "learning_rate": 2e-07, "loss": -0.046709203720092775, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.18333333730697632, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.18333333730697632, "step": 930, "train_speed(iter/s)": 0.031864 }, { "clip_ratio": 0.0, "completion_length": 240.85, "epoch": 0.37777777777777777, "grad_norm": 1.0159613386349218, "kl": 0.008165232185274363, "learning_rate": 2e-07, "loss": 0.03819341957569122, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166666939854624, "reward_std": 0.27071225047111513, "rewards/MultiModalAccuracyORM": 0.29166666939854624, "step": 935, "train_speed(iter/s)": 0.031907 }, { "clip_ratio": 0.0, "completion_length": 249.2, "epoch": 0.3797979797979798, "grad_norm": 1.7106230008719308, "kl": 0.006773473136126995, "learning_rate": 2e-07, "loss": -0.07135199308395386, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.34166667610406876, "reward_std": 0.287842845916748, "rewards/MultiModalAccuracyORM": 0.34166667610406876, "step": 940, "train_speed(iter/s)": 0.031944 }, { "clip_ratio": 0.0, "completion_length": 311.0, "epoch": 0.38181818181818183, "grad_norm": 0.9204904271016048, "kl": 0.00620469048153609, "learning_rate": 2e-07, "loss": 0.0036203682422637938, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.33299540281295775, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 945, "train_speed(iter/s)": 0.031985 }, { "clip_ratio": 0.0, "completion_length": 321.5, "epoch": 0.3838383838383838, "grad_norm": 1.9449992630577924, "kl": 0.0057474728906527165, "learning_rate": 2e-07, "loss": 0.010283425450325012, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3916666693985462, "reward_std": 0.28456337153911593, "rewards/MultiModalAccuracyORM": 0.3916666693985462, "step": 950, "train_speed(iter/s)": 0.031997 }, { "clip_ratio": 0.0, "completion_length": 354.65, "epoch": 0.38585858585858585, "grad_norm": 1.1872114495400206, "kl": 0.0066348537104204295, "learning_rate": 2e-07, "loss": -0.08897682428359985, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.30833334252238276, "reward_std": 0.3064227133989334, "rewards/MultiModalAccuracyORM": 0.30833334252238276, "step": 955, "train_speed(iter/s)": 0.031973 }, { "clip_ratio": 0.0, "completion_length": 293.85, "epoch": 0.3878787878787879, "grad_norm": 1.4160066184361069, "kl": 0.00627009014133364, "learning_rate": 2e-07, "loss": 0.024894729256629944, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.33303396999835966, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 960, "train_speed(iter/s)": 0.031975 }, { "clip_ratio": 0.0, "completion_length": 363.55, "epoch": 0.3898989898989899, "grad_norm": 1.2362614663841194, "kl": 0.006092234468087554, "learning_rate": 2e-07, "loss": 0.0033513441681861877, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.33333334103226664, "reward_std": 0.3174647957086563, "rewards/MultiModalAccuracyORM": 0.33333334103226664, "step": 965, "train_speed(iter/s)": 0.031996 }, { "clip_ratio": 0.0, "completion_length": 294.45, "epoch": 0.39191919191919194, "grad_norm": 0.6212974432181537, "kl": 0.005540155991911888, "learning_rate": 2e-07, "loss": 0.0035649120807647707, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.18333333879709243, "reward_std": 0.24615318179130555, "rewards/MultiModalAccuracyORM": 0.18333333879709243, "step": 970, "train_speed(iter/s)": 0.031961 }, { "clip_ratio": 0.0, "completion_length": 295.1, "epoch": 0.3939393939393939, "grad_norm": 0.5947339601867617, "kl": 0.005397630413062871, "learning_rate": 2e-07, "loss": -0.07813270688056946, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000521540643, "reward_std": 0.30191018283367155, "rewards/MultiModalAccuracyORM": 0.17500000521540643, "step": 975, "train_speed(iter/s)": 0.031954 }, { "clip_ratio": 0.0, "completion_length": 308.25, "epoch": 0.39595959595959596, "grad_norm": 0.5020572049064443, "kl": 0.005718397395685315, "learning_rate": 2e-07, "loss": 0.02026384472846985, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000298023223, "reward_std": 0.1911232739686966, "rewards/MultiModalAccuracyORM": 0.12500000298023223, "step": 980, "train_speed(iter/s)": 0.031951 }, { "clip_ratio": 0.0, "completion_length": 344.65, "epoch": 0.397979797979798, "grad_norm": 0.9401973082771917, "kl": 0.006880732695572078, "learning_rate": 2e-07, "loss": 0.033180487155914304, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.16666667386889458, "reward_std": 0.2074468731880188, "rewards/MultiModalAccuracyORM": 0.16666667386889458, "step": 985, "train_speed(iter/s)": 0.031964 }, { "clip_ratio": 0.0, "completion_length": 494.3, "epoch": 0.4, "grad_norm": 83.40224473063842, "kl": 0.12589137610048057, "learning_rate": 2e-07, "loss": 0.013488560914993286, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15000000447034836, "reward_std": 0.2832947254180908, "rewards/MultiModalAccuracyORM": 0.15000000447034836, "step": 990, "train_speed(iter/s)": 0.031966 }, { "clip_ratio": 0.0, "completion_length": 366.9, "epoch": 0.402020202020202, "grad_norm": 1.4324831057895324, "kl": 0.0055825527058914306, "learning_rate": 2e-07, "loss": -0.03936474025249481, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.4111736625432968, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 995, "train_speed(iter/s)": 0.03201 }, { "epoch": 0.40404040404040403, "grad_norm": 1.2517397120109381, "learning_rate": 2e-07, "loss": -0.015787112712860107, "memory(GiB)": 67.41, "step": 1000, "train_speed(iter/s)": 0.031987 }, { "epoch": 0.40404040404040403, "eval_clip_ratio": 0.0, "eval_completion_length": 325.82667709350585, "eval_kl": 0.005815695002675056, "eval_loss": 0.004047422204166651, "eval_response_clip_ratio": 0.001666666716337204, "eval_reward": 0.22833334043622017, "eval_reward_std": 0.31840195894241335, "eval_rewards/MultiModalAccuracyORM": 0.22833334043622017, "eval_runtime": 636.093, "eval_samples_per_second": 0.079, "eval_steps_per_second": 0.008, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 411.1, "epoch": 0.40606060606060607, "grad_norm": 0.013788807580567233, "kl": 0.0054129053140059115, "learning_rate": 2e-07, "loss": 0.0037709444761276243, "memory(GiB)": 67.41, "response_clip_ratio": 0.025, "reward": 0.1833333358168602, "reward_std": 0.3083831608295441, "rewards/MultiModalAccuracyORM": 0.1833333358168602, "step": 1005, "train_speed(iter/s)": 0.030995 }, { "clip_ratio": 0.0, "completion_length": 288.5, "epoch": 0.4080808080808081, "grad_norm": 0.7274819878482078, "kl": 0.00596827978733927, "learning_rate": 2e-07, "loss": 0.05422252416610718, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.3408707112073898, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 1010, "train_speed(iter/s)": 0.031034 }, { "clip_ratio": 0.0, "completion_length": 278.4, "epoch": 0.4101010101010101, "grad_norm": 0.48477183540520113, "kl": 0.0054684164701029655, "learning_rate": 2e-07, "loss": 0.037825629115104675, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3000000096857548, "reward_std": 0.2979002833366394, "rewards/MultiModalAccuracyORM": 0.3000000096857548, "step": 1015, "train_speed(iter/s)": 0.031065 }, { "clip_ratio": 0.0, "completion_length": 355.4, "epoch": 0.4121212121212121, "grad_norm": 2.4295423623484362, "kl": 0.005641359637957066, "learning_rate": 2e-07, "loss": -0.046464985609054564, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.16470665335655213, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 1020, "train_speed(iter/s)": 0.031083 }, { "clip_ratio": 0.0, "completion_length": 328.6, "epoch": 0.41414141414141414, "grad_norm": 1.2390331767029386, "kl": 0.005399754224345088, "learning_rate": 2e-07, "loss": 0.030136501789093016, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.10833333656191826, "reward_std": 0.20343697369098662, "rewards/MultiModalAccuracyORM": 0.10833333656191826, "step": 1025, "train_speed(iter/s)": 0.031111 }, { "clip_ratio": 0.0, "completion_length": 332.25, "epoch": 0.4161616161616162, "grad_norm": 0.9468249386621901, "kl": 0.006285157660022378, "learning_rate": 2e-07, "loss": 0.023849096894264222, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333432674408, "reward_std": 0.2488823115825653, "rewards/MultiModalAccuracyORM": 0.1833333432674408, "step": 1030, "train_speed(iter/s)": 0.031124 }, { "clip_ratio": 0.0, "completion_length": 374.6, "epoch": 0.41818181818181815, "grad_norm": 1.1115466247036063, "kl": 0.004610971501097083, "learning_rate": 2e-07, "loss": 0.0053513914346694945, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667386889457, "reward_std": 0.23930107951164245, "rewards/MultiModalAccuracyORM": 0.24166667386889457, "step": 1035, "train_speed(iter/s)": 0.031106 }, { "clip_ratio": 0.0, "completion_length": 351.1, "epoch": 0.4202020202020202, "grad_norm": 0.02105150606730856, "kl": 0.006059326883405447, "learning_rate": 2e-07, "loss": 0.003586888313293457, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334550261495, "reward_std": 0.324789759516716, "rewards/MultiModalAccuracyORM": 0.33333334550261495, "step": 1040, "train_speed(iter/s)": 0.031118 }, { "clip_ratio": 0.0, "completion_length": 417.0, "epoch": 0.4222222222222222, "grad_norm": 1.3465295426814468, "kl": 0.005017468379810453, "learning_rate": 2e-07, "loss": 0.01884058117866516, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.15833333730697632, "reward_std": 0.33526621460914613, "rewards/MultiModalAccuracyORM": 0.15833333730697632, "step": 1045, "train_speed(iter/s)": 0.031118 }, { "clip_ratio": 0.0, "completion_length": 385.25, "epoch": 0.42424242424242425, "grad_norm": 0.01829834451184037, "kl": 0.00570887109497562, "learning_rate": 2e-07, "loss": -0.04955781400203705, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000149011613, "reward_std": 0.1808116167783737, "rewards/MultiModalAccuracyORM": 0.17500000149011613, "step": 1050, "train_speed(iter/s)": 0.031106 }, { "clip_ratio": 0.0, "completion_length": 294.55, "epoch": 0.4262626262626263, "grad_norm": 1.1139595382513947, "kl": 0.0063067243434488775, "learning_rate": 2e-07, "loss": 0.037534278631210324, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666666939854623, "reward_std": 0.2923289448022842, "rewards/MultiModalAccuracyORM": 0.21666666939854623, "step": 1055, "train_speed(iter/s)": 0.031146 }, { "clip_ratio": 0.0, "completion_length": 310.5, "epoch": 0.42828282828282827, "grad_norm": 1.2445691938767505, "kl": 0.006325511611066759, "learning_rate": 2e-07, "loss": -0.044334182143211366, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.3978011727333069, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 1060, "train_speed(iter/s)": 0.031171 }, { "clip_ratio": 0.0, "completion_length": 303.7, "epoch": 0.4303030303030303, "grad_norm": 0.9566673579166692, "kl": 0.0070721972035244106, "learning_rate": 2e-07, "loss": -0.005390632152557373, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000081956387, "reward_std": 0.3438218146562576, "rewards/MultiModalAccuracyORM": 0.3250000081956387, "step": 1065, "train_speed(iter/s)": 0.031187 }, { "clip_ratio": 0.0, "completion_length": 349.65, "epoch": 0.43232323232323233, "grad_norm": 0.6080597174926101, "kl": 0.005873536411672831, "learning_rate": 2e-07, "loss": 0.02115156948566437, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666783392429, "reward_std": 0.2885376811027527, "rewards/MultiModalAccuracyORM": 0.3416666783392429, "step": 1070, "train_speed(iter/s)": 0.03122 }, { "clip_ratio": 0.0, "completion_length": 388.05, "epoch": 0.43434343434343436, "grad_norm": 1.1890985376722285, "kl": 0.005225225887261331, "learning_rate": 2e-07, "loss": 0.033620885014533995, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666716337204, "reward_std": 0.2730426698923111, "rewards/MultiModalAccuracyORM": 0.1666666716337204, "step": 1075, "train_speed(iter/s)": 0.03125 }, { "clip_ratio": 0.0, "completion_length": 257.8, "epoch": 0.43636363636363634, "grad_norm": 0.9920368386170019, "kl": 0.007723887427709996, "learning_rate": 2e-07, "loss": -0.01428629457950592, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666738688946, "reward_std": 0.3488905102014542, "rewards/MultiModalAccuracyORM": 0.3166666738688946, "step": 1080, "train_speed(iter/s)": 0.031278 }, { "clip_ratio": 0.0, "completion_length": 299.95, "epoch": 0.4383838383838384, "grad_norm": 0.8633228611517588, "kl": 0.006215728004463017, "learning_rate": 2e-07, "loss": 0.009860965609550475, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.10833333879709243, "reward_std": 0.20588786602020265, "rewards/MultiModalAccuracyORM": 0.10833333879709243, "step": 1085, "train_speed(iter/s)": 0.031299 }, { "clip_ratio": 0.0, "completion_length": 249.4, "epoch": 0.4404040404040404, "grad_norm": 1.1078043273889853, "kl": 0.008284115185961127, "learning_rate": 2e-07, "loss": -0.027253830432891847, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.31467272639274596, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 1090, "train_speed(iter/s)": 0.031335 }, { "clip_ratio": 0.0, "completion_length": 301.9, "epoch": 0.44242424242424244, "grad_norm": 1.3316514075181503, "kl": 0.007131563685834408, "learning_rate": 2e-07, "loss": 0.01606808602809906, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.39166667461395266, "reward_std": 0.3697382241487503, "rewards/MultiModalAccuracyORM": 0.39166667461395266, "step": 1095, "train_speed(iter/s)": 0.031356 }, { "clip_ratio": 0.0, "completion_length": 277.3, "epoch": 0.4444444444444444, "grad_norm": 1.5007656151975992, "kl": 0.005109827104024589, "learning_rate": 2e-07, "loss": 0.012760597467422485, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30000000819563866, "reward_std": 0.4181702554225922, "rewards/MultiModalAccuracyORM": 0.30000000819563866, "step": 1100, "train_speed(iter/s)": 0.031388 }, { "clip_ratio": 0.0, "completion_length": 269.8, "epoch": 0.44646464646464645, "grad_norm": 1.1822162392393358, "kl": 0.006911608600057661, "learning_rate": 2e-07, "loss": -0.004604104161262512, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.37523541152477263, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 1105, "train_speed(iter/s)": 0.031425 }, { "clip_ratio": 0.0, "completion_length": 233.1, "epoch": 0.4484848484848485, "grad_norm": 1.0513525935612356, "kl": 0.007250142516568303, "learning_rate": 2e-07, "loss": 0.011294081062078475, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667088866236, "reward_std": 0.2692273885011673, "rewards/MultiModalAccuracyORM": 0.26666667088866236, "step": 1110, "train_speed(iter/s)": 0.031461 }, { "clip_ratio": 0.0, "completion_length": 336.4, "epoch": 0.4505050505050505, "grad_norm": 0.588254196095547, "kl": 0.0058827483095228675, "learning_rate": 2e-07, "loss": 0.008113735914230346, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1083333358168602, "reward_std": 0.24885829985141755, "rewards/MultiModalAccuracyORM": 0.1083333358168602, "step": 1115, "train_speed(iter/s)": 0.031497 }, { "clip_ratio": 0.0, "completion_length": 344.8, "epoch": 0.45252525252525255, "grad_norm": 0.9984948999076526, "kl": 0.007338272430934012, "learning_rate": 2e-07, "loss": 0.05758116841316223, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3659469664096832, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 1120, "train_speed(iter/s)": 0.031502 }, { "clip_ratio": 0.0, "completion_length": 368.15, "epoch": 0.45454545454545453, "grad_norm": 0.509714512716735, "kl": 0.0060618318850174545, "learning_rate": 2e-07, "loss": -0.0008696913719177246, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.43333335071802137, "reward_std": 0.4385633558034897, "rewards/MultiModalAccuracyORM": 0.43333335071802137, "step": 1125, "train_speed(iter/s)": 0.0315 }, { "clip_ratio": 0.0, "completion_length": 237.15, "epoch": 0.45656565656565656, "grad_norm": 12.152246394116803, "kl": 0.010060751531273126, "learning_rate": 2e-07, "loss": 0.044137763977050784, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667461395266, "reward_std": 0.3440760403871536, "rewards/MultiModalAccuracyORM": 0.26666667461395266, "step": 1130, "train_speed(iter/s)": 0.031529 }, { "clip_ratio": 0.0, "completion_length": 423.0, "epoch": 0.4585858585858586, "grad_norm": 0.9173177729995868, "kl": 0.005417682533152402, "learning_rate": 2e-07, "loss": -0.0018961310386657714, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.07500000298023224, "reward_std": 0.1481528401374817, "rewards/MultiModalAccuracyORM": 0.07500000298023224, "step": 1135, "train_speed(iter/s)": 0.031537 }, { "clip_ratio": 0.0, "completion_length": 290.7, "epoch": 0.46060606060606063, "grad_norm": 1.2629928855399732, "kl": 0.007898857281543315, "learning_rate": 2e-07, "loss": 0.0265865683555603, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333879709245, "reward_std": 0.28697867393493653, "rewards/MultiModalAccuracyORM": 0.23333333879709245, "step": 1140, "train_speed(iter/s)": 0.031512 }, { "clip_ratio": 0.0, "completion_length": 406.35, "epoch": 0.4626262626262626, "grad_norm": 0.012765946285130914, "kl": 0.005864207935519517, "learning_rate": 2e-07, "loss": 0.005378928780555725, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.10833333507180214, "reward_std": 0.22629254460334777, "rewards/MultiModalAccuracyORM": 0.10833333507180214, "step": 1145, "train_speed(iter/s)": 0.031533 }, { "clip_ratio": 0.0, "completion_length": 371.75, "epoch": 0.46464646464646464, "grad_norm": 1.2497788736637212, "kl": 0.00878450043965131, "learning_rate": 2e-07, "loss": 0.04154196977615356, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2583333417773247, "reward_std": 0.33478758931159974, "rewards/MultiModalAccuracyORM": 0.2583333417773247, "step": 1150, "train_speed(iter/s)": 0.031545 }, { "clip_ratio": 0.0, "completion_length": 328.6, "epoch": 0.4666666666666667, "grad_norm": 1.144170406383162, "kl": 0.00682174377143383, "learning_rate": 2e-07, "loss": 0.054303860664367674, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.291666679084301, "reward_std": 0.31667476892471313, "rewards/MultiModalAccuracyORM": 0.291666679084301, "step": 1155, "train_speed(iter/s)": 0.031578 }, { "clip_ratio": 0.0, "completion_length": 270.75, "epoch": 0.4686868686868687, "grad_norm": 1.706239065161423, "kl": 0.007611270109191537, "learning_rate": 2e-07, "loss": 0.05665465593338013, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666738688946, "reward_std": 0.3656185895204544, "rewards/MultiModalAccuracyORM": 0.3416666738688946, "step": 1160, "train_speed(iter/s)": 0.031587 }, { "clip_ratio": 0.0, "completion_length": 275.9, "epoch": 0.4707070707070707, "grad_norm": 0.6591730740554306, "kl": 0.006474771653302014, "learning_rate": 2e-07, "loss": 0.0037678249180316927, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.24255654215812683, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 1165, "train_speed(iter/s)": 0.031615 }, { "clip_ratio": 0.0, "completion_length": 347.85, "epoch": 0.4727272727272727, "grad_norm": 0.8135071730864046, "kl": 0.007919127470813692, "learning_rate": 2e-07, "loss": 0.042950406670570374, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334177732465, "reward_std": 0.4546443074941635, "rewards/MultiModalAccuracyORM": 0.33333334177732465, "step": 1170, "train_speed(iter/s)": 0.031644 }, { "clip_ratio": 0.0, "completion_length": 388.35, "epoch": 0.47474747474747475, "grad_norm": 0.735952514646633, "kl": 0.00741737331263721, "learning_rate": 2e-07, "loss": -0.020889997482299805, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.34076098203659055, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 1175, "train_speed(iter/s)": 0.031661 }, { "clip_ratio": 0.0, "completion_length": 373.25, "epoch": 0.4767676767676768, "grad_norm": 2.0420078916899143, "kl": 0.007773328572511673, "learning_rate": 2e-07, "loss": -0.020136108994483946, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.37396675944328306, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 1180, "train_speed(iter/s)": 0.031682 }, { "clip_ratio": 0.0, "completion_length": 426.3, "epoch": 0.47878787878787876, "grad_norm": 1.50872882008361, "kl": 0.006335928500629961, "learning_rate": 2e-07, "loss": 0.06880509257316589, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500001043081285, "reward_std": 0.3852572590112686, "rewards/MultiModalAccuracyORM": 0.27500001043081285, "step": 1185, "train_speed(iter/s)": 0.031678 }, { "clip_ratio": 0.0, "completion_length": 358.45, "epoch": 0.4808080808080808, "grad_norm": 0.7395112570215242, "kl": 0.007393318344838917, "learning_rate": 2e-07, "loss": 0.02349342405796051, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666731238365, "reward_std": 0.2574403375387192, "rewards/MultiModalAccuracyORM": 0.1666666731238365, "step": 1190, "train_speed(iter/s)": 0.031694 }, { "clip_ratio": 0.0, "completion_length": 347.9, "epoch": 0.48282828282828283, "grad_norm": 0.8770660085210343, "kl": 0.008547824015840888, "learning_rate": 2e-07, "loss": 0.00840257853269577, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.3003867596387863, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 1195, "train_speed(iter/s)": 0.031694 }, { "clip_ratio": 0.0, "completion_length": 248.55, "epoch": 0.48484848484848486, "grad_norm": 4.837760223432283, "kl": 0.007676198193803429, "learning_rate": 2e-07, "loss": 0.04577964842319489, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000298023225, "reward_std": 0.41469616293907163, "rewards/MultiModalAccuracyORM": 0.27500000298023225, "step": 1200, "train_speed(iter/s)": 0.031731 }, { "clip_ratio": 0.0, "completion_length": 370.1, "epoch": 0.4868686868686869, "grad_norm": 0.8131117130602656, "kl": 0.006847620429471135, "learning_rate": 2e-07, "loss": 0.038221675157547, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.46666667610406876, "reward_std": 0.44790194034576414, "rewards/MultiModalAccuracyORM": 0.46666667610406876, "step": 1205, "train_speed(iter/s)": 0.031756 }, { "clip_ratio": 0.0, "completion_length": 214.2, "epoch": 0.4888888888888889, "grad_norm": 1.0083613540747984, "kl": 0.009360355604439975, "learning_rate": 2e-07, "loss": 0.04207033514976501, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.44166667461395265, "reward_std": 0.30734776258468627, "rewards/MultiModalAccuracyORM": 0.44166667461395265, "step": 1210, "train_speed(iter/s)": 0.031802 }, { "clip_ratio": 0.0, "completion_length": 370.75, "epoch": 0.4909090909090909, "grad_norm": 0.8535636422021001, "kl": 0.007888032216578722, "learning_rate": 2e-07, "loss": 0.02074309587478638, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3750000029802322, "reward_std": 0.36721318662166597, "rewards/MultiModalAccuracyORM": 0.3750000029802322, "step": 1215, "train_speed(iter/s)": 0.031817 }, { "clip_ratio": 0.0, "completion_length": 301.45, "epoch": 0.49292929292929294, "grad_norm": 1.1747245735311718, "kl": 0.005809159600175917, "learning_rate": 2e-07, "loss": 0.030560284852981567, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.10000000223517418, "reward_std": 0.24860407412052155, "rewards/MultiModalAccuracyORM": 0.10000000223517418, "step": 1220, "train_speed(iter/s)": 0.031788 }, { "clip_ratio": 0.0, "completion_length": 316.7, "epoch": 0.494949494949495, "grad_norm": 1.3342180404809851, "kl": 0.007361576543189585, "learning_rate": 2e-07, "loss": 0.005729189515113831, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666805744171, "reward_std": 0.42749726176261904, "rewards/MultiModalAccuracyORM": 0.3666666805744171, "step": 1225, "train_speed(iter/s)": 0.03183 }, { "clip_ratio": 0.0, "completion_length": 264.4, "epoch": 0.49696969696969695, "grad_norm": 1.3354967765672678, "kl": 0.0077354055363684894, "learning_rate": 2e-07, "loss": 0.011936230957508088, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000037252903, "reward_std": 0.28934225142002107, "rewards/MultiModalAccuracyORM": 0.2500000037252903, "step": 1230, "train_speed(iter/s)": 0.031862 }, { "clip_ratio": 0.0, "completion_length": 372.85, "epoch": 0.498989898989899, "grad_norm": 1.7029900631069643, "kl": 0.009492517588660121, "learning_rate": 2e-07, "loss": 0.026520213484764098, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.39930057227611543, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 1235, "train_speed(iter/s)": 0.03189 }, { "clip_ratio": 0.0, "completion_length": 437.5, "epoch": 0.501010101010101, "grad_norm": 0.8836311467778365, "kl": 0.007044275873340666, "learning_rate": 2e-07, "loss": 0.02124674618244171, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.26518189907073975, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 1240, "train_speed(iter/s)": 0.031898 }, { "clip_ratio": 0.0, "completion_length": 456.55, "epoch": 0.503030303030303, "grad_norm": 0.713932604855569, "kl": 0.007297229184769094, "learning_rate": 2e-07, "loss": -0.049902528524398804, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.21666667535901069, "reward_std": 0.3636551022529602, "rewards/MultiModalAccuracyORM": 0.21666667535901069, "step": 1245, "train_speed(iter/s)": 0.031896 }, { "epoch": 0.5050505050505051, "grad_norm": 1.0772405355167842, "learning_rate": 2e-07, "loss": 0.023000609874725342, "memory(GiB)": 67.41, "step": 1250, "train_speed(iter/s)": 0.031918 }, { "epoch": 0.5050505050505051, "eval_clip_ratio": 0.0, "eval_completion_length": 317.3300076293945, "eval_kl": 0.008607916957698762, "eval_loss": 0.04203889146447182, "eval_response_clip_ratio": 0.005000000149011612, "eval_reward": 0.26166667401790616, "eval_reward_std": 0.33101949989795687, "eval_rewards/MultiModalAccuracyORM": 0.26166667401790616, "eval_runtime": 649.7206, "eval_samples_per_second": 0.077, "eval_steps_per_second": 0.008, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 386.7, "epoch": 0.5070707070707071, "grad_norm": 0.8869623944003157, "kl": 0.007809204491786658, "learning_rate": 2e-07, "loss": 0.08510025143623352, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3126992493867874, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 1255, "train_speed(iter/s)": 0.031124 }, { "clip_ratio": 0.0, "completion_length": 239.6, "epoch": 0.509090909090909, "grad_norm": 1.5782361460355463, "kl": 0.009718046616762876, "learning_rate": 2e-07, "loss": 0.06461310386657715, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666783392429, "reward_std": 0.39375568330287936, "rewards/MultiModalAccuracyORM": 0.3416666783392429, "step": 1260, "train_speed(iter/s)": 0.031165 }, { "clip_ratio": 0.0, "completion_length": 309.85, "epoch": 0.5111111111111111, "grad_norm": 2.199476894866435, "kl": 0.007806334691122174, "learning_rate": 2e-07, "loss": 0.006014569103717804, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000149011612, "reward_std": 0.24265173375606536, "rewards/MultiModalAccuracyORM": 0.22500000149011612, "step": 1265, "train_speed(iter/s)": 0.031162 }, { "clip_ratio": 0.0, "completion_length": 333.35, "epoch": 0.5131313131313131, "grad_norm": 0.7012273951483338, "kl": 0.009944566525518894, "learning_rate": 2e-07, "loss": 0.08870444297790528, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.39166668206453326, "reward_std": 0.407406410574913, "rewards/MultiModalAccuracyORM": 0.39166668206453326, "step": 1270, "train_speed(iter/s)": 0.031181 }, { "clip_ratio": 0.0, "completion_length": 241.9, "epoch": 0.5151515151515151, "grad_norm": 0.8594993279946802, "kl": 0.009327950514853, "learning_rate": 2e-07, "loss": -0.0274441659450531, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.31666667610406873, "reward_std": 0.38369824588298795, "rewards/MultiModalAccuracyORM": 0.31666667610406873, "step": 1275, "train_speed(iter/s)": 0.031192 }, { "clip_ratio": 0.0, "completion_length": 263.5, "epoch": 0.5171717171717172, "grad_norm": 1.605471507408993, "kl": 0.008260847954079508, "learning_rate": 2e-07, "loss": 0.01983429193496704, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.21149236261844634, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 1280, "train_speed(iter/s)": 0.03122 }, { "clip_ratio": 0.0, "completion_length": 268.75, "epoch": 0.5191919191919192, "grad_norm": 0.7970718049407819, "kl": 0.009291452821344137, "learning_rate": 2e-07, "loss": -0.0878964126110077, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667461395264, "reward_std": 0.3267081886529922, "rewards/MultiModalAccuracyORM": 0.24166667461395264, "step": 1285, "train_speed(iter/s)": 0.031244 }, { "clip_ratio": 0.0, "completion_length": 272.7, "epoch": 0.5212121212121212, "grad_norm": 0.03636319780256794, "kl": 0.010186967998743057, "learning_rate": 2e-07, "loss": 0.014943599700927734, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.14166666865348815, "reward_std": 0.2621566504240036, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 1290, "train_speed(iter/s)": 0.031292 }, { "clip_ratio": 0.0, "completion_length": 350.85, "epoch": 0.5232323232323233, "grad_norm": 0.5560985576302439, "kl": 0.008923888113349676, "learning_rate": 2e-07, "loss": 0.01636694073677063, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.3833333387970924, "reward_std": 0.379781112074852, "rewards/MultiModalAccuracyORM": 0.3833333387970924, "step": 1295, "train_speed(iter/s)": 0.031311 }, { "clip_ratio": 0.0, "completion_length": 347.65, "epoch": 0.5252525252525253, "grad_norm": 1.0969162898318026, "kl": 0.007647776743397117, "learning_rate": 2e-07, "loss": 0.011600933969020844, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2916666679084301, "reward_std": 0.33984750509262085, "rewards/MultiModalAccuracyORM": 0.2916666679084301, "step": 1300, "train_speed(iter/s)": 0.031311 }, { "clip_ratio": 0.0, "completion_length": 333.3, "epoch": 0.5272727272727272, "grad_norm": 1.0609203809073788, "kl": 0.009041132358834147, "learning_rate": 2e-07, "loss": 0.05796287655830383, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.37500000968575475, "reward_std": 0.395568910241127, "rewards/MultiModalAccuracyORM": 0.37500000968575475, "step": 1305, "train_speed(iter/s)": 0.031317 }, { "clip_ratio": 0.0, "completion_length": 307.35, "epoch": 0.5292929292929293, "grad_norm": 1.2425047304213874, "kl": 0.0067569724516943095, "learning_rate": 2e-07, "loss": 0.0740867018699646, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.35792473554611204, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 1310, "train_speed(iter/s)": 0.031328 }, { "clip_ratio": 0.0, "completion_length": 379.15, "epoch": 0.5313131313131313, "grad_norm": 0.9808794955952097, "kl": 0.010406963923014701, "learning_rate": 2e-07, "loss": 0.01893787384033203, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667237877845, "reward_std": 0.3398119151592255, "rewards/MultiModalAccuracyORM": 0.26666667237877845, "step": 1315, "train_speed(iter/s)": 0.031339 }, { "clip_ratio": 0.0, "completion_length": 182.9, "epoch": 0.5333333333333333, "grad_norm": 1.2456162412938805, "kl": 0.011189991328865289, "learning_rate": 2e-07, "loss": -0.014865723252296448, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166667237877847, "reward_std": 0.29258317649364474, "rewards/MultiModalAccuracyORM": 0.29166667237877847, "step": 1320, "train_speed(iter/s)": 0.03136 }, { "clip_ratio": 0.0, "completion_length": 253.65, "epoch": 0.5353535353535354, "grad_norm": 2.0692124425676828, "kl": 0.008971794368699193, "learning_rate": 2e-07, "loss": -0.0039320230484008786, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3000000089406967, "reward_std": 0.350342208147049, "rewards/MultiModalAccuracyORM": 0.3000000089406967, "step": 1325, "train_speed(iter/s)": 0.03138 }, { "clip_ratio": 0.0, "completion_length": 339.95, "epoch": 0.5373737373737374, "grad_norm": 0.019727773431517187, "kl": 0.008127374900504946, "learning_rate": 2e-07, "loss": 0.014944207668304444, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000819563863, "reward_std": 0.2855509877204895, "rewards/MultiModalAccuracyORM": 0.27500000819563863, "step": 1330, "train_speed(iter/s)": 0.031384 }, { "clip_ratio": 0.0, "completion_length": 285.5, "epoch": 0.5393939393939394, "grad_norm": 1.8758979222370529, "kl": 0.011889316607266665, "learning_rate": 2e-07, "loss": 0.016926100850105284, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667610406875, "reward_std": 0.35766717195510866, "rewards/MultiModalAccuracyORM": 0.26666667610406875, "step": 1335, "train_speed(iter/s)": 0.031406 }, { "clip_ratio": 0.0, "completion_length": 260.65, "epoch": 0.5414141414141415, "grad_norm": 1.7504504529530354, "kl": 0.009696374088525772, "learning_rate": 2e-07, "loss": 0.0762328028678894, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333334177732467, "reward_std": 0.36349606812000274, "rewards/MultiModalAccuracyORM": 0.28333334177732467, "step": 1340, "train_speed(iter/s)": 0.031433 }, { "clip_ratio": 0.0, "completion_length": 257.1, "epoch": 0.5434343434343434, "grad_norm": 1.0790393664453202, "kl": 0.01006975807249546, "learning_rate": 2e-07, "loss": 0.054473668336868286, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000111758709, "reward_std": 0.3104086071252823, "rewards/MultiModalAccuracyORM": 0.2750000111758709, "step": 1345, "train_speed(iter/s)": 0.03143 }, { "clip_ratio": 0.0, "completion_length": 254.4, "epoch": 0.5454545454545454, "grad_norm": 0.03144888010820615, "kl": 0.009962662309408187, "learning_rate": 2e-07, "loss": 0.04643962681293488, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1166666716337204, "reward_std": 0.18083563446998596, "rewards/MultiModalAccuracyORM": 0.1166666716337204, "step": 1350, "train_speed(iter/s)": 0.031451 }, { "clip_ratio": 0.0, "completion_length": 273.45, "epoch": 0.5474747474747474, "grad_norm": 0.8833335261829609, "kl": 0.012588053662329911, "learning_rate": 2e-07, "loss": 0.009816545248031616, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333333805203436, "reward_std": 0.2777854144573212, "rewards/MultiModalAccuracyORM": 0.33333333805203436, "step": 1355, "train_speed(iter/s)": 0.03148 }, { "clip_ratio": 0.0, "completion_length": 296.8, "epoch": 0.5494949494949495, "grad_norm": 1.2021920029810926, "kl": 0.010357017442584038, "learning_rate": 2e-07, "loss": 0.0067857176065444945, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000819563865, "reward_std": 0.28529676198959353, "rewards/MultiModalAccuracyORM": 0.20000000819563865, "step": 1360, "train_speed(iter/s)": 0.031501 }, { "clip_ratio": 0.0, "completion_length": 346.05, "epoch": 0.5515151515151515, "grad_norm": 1.0531405923571842, "kl": 0.009836095664650202, "learning_rate": 2e-07, "loss": -0.004297977685928345, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667461395264, "reward_std": 0.2815410941839218, "rewards/MultiModalAccuracyORM": 0.21666667461395264, "step": 1365, "train_speed(iter/s)": 0.031525 }, { "clip_ratio": 0.0, "completion_length": 225.5, "epoch": 0.5535353535353535, "grad_norm": 0.6717065174636488, "kl": 0.00997301978059113, "learning_rate": 2e-07, "loss": -0.06482144594192504, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2416666753590107, "reward_std": 0.3402847766876221, "rewards/MultiModalAccuracyORM": 0.2416666753590107, "step": 1370, "train_speed(iter/s)": 0.03155 }, { "clip_ratio": 0.0, "completion_length": 216.2, "epoch": 0.5555555555555556, "grad_norm": 1.3753758040643629, "kl": 0.012827477231621743, "learning_rate": 2e-07, "loss": 0.002781185507774353, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.308333333581686, "reward_std": 0.17154421210289, "rewards/MultiModalAccuracyORM": 0.308333333581686, "step": 1375, "train_speed(iter/s)": 0.031568 }, { "clip_ratio": 0.0, "completion_length": 315.3, "epoch": 0.5575757575757576, "grad_norm": 1.182226370113768, "kl": 0.011787687614560127, "learning_rate": 2e-07, "loss": 0.047075501084327696, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.18333333656191825, "reward_std": 0.28803746998310087, "rewards/MultiModalAccuracyORM": 0.18333333656191825, "step": 1380, "train_speed(iter/s)": 0.031595 }, { "clip_ratio": 0.0, "completion_length": 263.9, "epoch": 0.5595959595959596, "grad_norm": 0.7310584791826561, "kl": 0.009143536072224378, "learning_rate": 2e-07, "loss": 0.02444952130317688, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333334177732467, "reward_std": 0.39707074761390687, "rewards/MultiModalAccuracyORM": 0.28333334177732467, "step": 1385, "train_speed(iter/s)": 0.031597 }, { "clip_ratio": 0.0, "completion_length": 169.25, "epoch": 0.5616161616161616, "grad_norm": 0.07084639511270675, "kl": 0.014022548403590917, "learning_rate": 2e-07, "loss": 0.010427016019821166, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667386889457, "reward_std": 0.3523798406124115, "rewards/MultiModalAccuracyORM": 0.24166667386889457, "step": 1390, "train_speed(iter/s)": 0.031647 }, { "clip_ratio": 0.0, "completion_length": 313.95, "epoch": 0.5636363636363636, "grad_norm": 1.0165143719005905, "kl": 0.00867614927701652, "learning_rate": 2e-07, "loss": 0.013519459962844848, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35833334028720853, "reward_std": 0.4385393440723419, "rewards/MultiModalAccuracyORM": 0.35833334028720853, "step": 1395, "train_speed(iter/s)": 0.031669 }, { "clip_ratio": 0.0, "completion_length": 316.65, "epoch": 0.5656565656565656, "grad_norm": 1.4260416543225247, "kl": 0.012046672217547894, "learning_rate": 2e-07, "loss": 0.052398312091827395, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.23631438612937927, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 1400, "train_speed(iter/s)": 0.031689 }, { "clip_ratio": 0.0, "completion_length": 271.95, "epoch": 0.5676767676767677, "grad_norm": 0.9876833902717693, "kl": 0.011473514698445797, "learning_rate": 2e-07, "loss": 0.012492635846138, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.2682041823863983, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 1405, "train_speed(iter/s)": 0.031712 }, { "clip_ratio": 0.0, "completion_length": 326.5, "epoch": 0.5696969696969697, "grad_norm": 1.3575244353002172, "kl": 0.00927637224085629, "learning_rate": 2e-07, "loss": -0.0041919216513633725, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.308333345502615, "reward_std": 0.3556593209505081, "rewards/MultiModalAccuracyORM": 0.308333345502615, "step": 1410, "train_speed(iter/s)": 0.031729 }, { "clip_ratio": 0.0, "completion_length": 445.25, "epoch": 0.5717171717171717, "grad_norm": 0.47740004196130253, "kl": 0.011411032918840647, "learning_rate": 2e-07, "loss": 0.042676869034767154, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2250000074505806, "reward_std": 0.31645613312721255, "rewards/MultiModalAccuracyORM": 0.2250000074505806, "step": 1415, "train_speed(iter/s)": 0.031733 }, { "clip_ratio": 0.0, "completion_length": 286.7, "epoch": 0.5737373737373738, "grad_norm": 0.8014116239502665, "kl": 0.008727412531152367, "learning_rate": 2e-07, "loss": 0.008877889811992645, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000447034835, "reward_std": 0.20973873138427734, "rewards/MultiModalAccuracyORM": 0.20000000447034835, "step": 1420, "train_speed(iter/s)": 0.031744 }, { "clip_ratio": 0.0, "completion_length": 208.8, "epoch": 0.5757575757575758, "grad_norm": 0.5960886338338734, "kl": 0.01004549846984446, "learning_rate": 2e-07, "loss": 0.07170453071594238, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666753590107, "reward_std": 0.3089091956615448, "rewards/MultiModalAccuracyORM": 0.3666666753590107, "step": 1425, "train_speed(iter/s)": 0.031773 }, { "clip_ratio": 0.0, "completion_length": 338.7, "epoch": 0.5777777777777777, "grad_norm": 0.8709949111887233, "kl": 0.01054220967926085, "learning_rate": 2e-07, "loss": 0.0183966726064682, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000298023224, "reward_std": 0.24255654215812683, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 1430, "train_speed(iter/s)": 0.031768 }, { "clip_ratio": 0.0, "completion_length": 383.5, "epoch": 0.5797979797979798, "grad_norm": 0.4149003361127432, "kl": 0.00967580354772508, "learning_rate": 2e-07, "loss": 0.01183580830693245, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.3471368789672852, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 1435, "train_speed(iter/s)": 0.031771 }, { "clip_ratio": 0.0, "completion_length": 288.05, "epoch": 0.5818181818181818, "grad_norm": 1.8127646049079418, "kl": 0.011693871626630426, "learning_rate": 2e-07, "loss": -0.017411130666732787, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000074505805, "reward_std": 0.36495934426784515, "rewards/MultiModalAccuracyORM": 0.20000000074505805, "step": 1440, "train_speed(iter/s)": 0.031798 }, { "clip_ratio": 0.0, "completion_length": 266.7, "epoch": 0.5838383838383838, "grad_norm": 1.0167459169189659, "kl": 0.011920861806720496, "learning_rate": 2e-07, "loss": -0.002490566670894623, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333656191826, "reward_std": 0.3312538951635361, "rewards/MultiModalAccuracyORM": 0.23333333656191826, "step": 1445, "train_speed(iter/s)": 0.031834 }, { "clip_ratio": 0.0, "completion_length": 372.8, "epoch": 0.5858585858585859, "grad_norm": 0.43569174707905806, "kl": 0.010208403388969601, "learning_rate": 2e-07, "loss": 0.004625104367733002, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.18333334103226662, "reward_std": 0.31266486942768096, "rewards/MultiModalAccuracyORM": 0.18333334103226662, "step": 1450, "train_speed(iter/s)": 0.031851 }, { "clip_ratio": 0.0, "completion_length": 305.55, "epoch": 0.5878787878787879, "grad_norm": 1.2047141815062223, "kl": 0.012308929720893503, "learning_rate": 2e-07, "loss": 0.013238468766212463, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.05000000149011612, "reward_std": 0.13558491468429565, "rewards/MultiModalAccuracyORM": 0.05000000149011612, "step": 1455, "train_speed(iter/s)": 0.031866 }, { "clip_ratio": 0.0, "completion_length": 310.75, "epoch": 0.5898989898989899, "grad_norm": 1.3779114361595524, "kl": 0.009801013302057982, "learning_rate": 2e-07, "loss": 0.02388697862625122, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000111758709, "reward_std": 0.43350920975208285, "rewards/MultiModalAccuracyORM": 0.3250000111758709, "step": 1460, "train_speed(iter/s)": 0.031876 }, { "clip_ratio": 0.0, "completion_length": 269.7, "epoch": 0.591919191919192, "grad_norm": 1.8826359120192269, "kl": 0.011588224535807967, "learning_rate": 2e-07, "loss": 0.0029266417026519776, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667237877845, "reward_std": 0.3345689594745636, "rewards/MultiModalAccuracyORM": 0.24166667237877845, "step": 1465, "train_speed(iter/s)": 0.031906 }, { "clip_ratio": 0.0, "completion_length": 308.7, "epoch": 0.593939393939394, "grad_norm": 0.9727598907618097, "kl": 0.012582354433834552, "learning_rate": 2e-07, "loss": 0.04956952333450317, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.24166667386889457, "reward_std": 0.27148365676403047, "rewards/MultiModalAccuracyORM": 0.24166667386889457, "step": 1470, "train_speed(iter/s)": 0.03192 }, { "clip_ratio": 0.0, "completion_length": 438.0, "epoch": 0.5959595959595959, "grad_norm": 0.986971470103617, "kl": 0.009621695009991526, "learning_rate": 2e-07, "loss": 0.02806570827960968, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666666939854623, "reward_std": 0.28752902448177337, "rewards/MultiModalAccuracyORM": 0.21666666939854623, "step": 1475, "train_speed(iter/s)": 0.031935 }, { "clip_ratio": 0.0, "completion_length": 232.65, "epoch": 0.597979797979798, "grad_norm": 2.281672700874799, "kl": 0.014227775321342052, "learning_rate": 2e-07, "loss": -0.016278558969497682, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3000000067055225, "reward_std": 0.3494287371635437, "rewards/MultiModalAccuracyORM": 0.3000000067055225, "step": 1480, "train_speed(iter/s)": 0.031979 }, { "clip_ratio": 0.0, "completion_length": 297.4, "epoch": 0.6, "grad_norm": 0.4294218493479977, "kl": 0.010013082064688206, "learning_rate": 2e-07, "loss": 0.020896130800247194, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334550261497, "reward_std": 0.2818193256855011, "rewards/MultiModalAccuracyORM": 0.23333334550261497, "step": 1485, "train_speed(iter/s)": 0.031988 }, { "clip_ratio": 0.0, "completion_length": 357.2, "epoch": 0.602020202020202, "grad_norm": 1.4950104429019768, "kl": 0.010283974278718234, "learning_rate": 2e-07, "loss": -0.016252765059471132, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1250000037252903, "reward_std": 0.26292563080787656, "rewards/MultiModalAccuracyORM": 0.1250000037252903, "step": 1490, "train_speed(iter/s)": 0.032019 }, { "clip_ratio": 0.0, "completion_length": 299.35, "epoch": 0.604040404040404, "grad_norm": 1.0324610637121567, "kl": 0.012730671325698495, "learning_rate": 2e-07, "loss": 0.07304045557975769, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000074505806, "reward_std": 0.42524099349975586, "rewards/MultiModalAccuracyORM": 0.3250000074505806, "step": 1495, "train_speed(iter/s)": 0.032051 }, { "epoch": 0.6060606060606061, "grad_norm": 1.0986021903349583, "learning_rate": 2e-07, "loss": 0.02047921419143677, "memory(GiB)": 67.41, "step": 1500, "train_speed(iter/s)": 0.032077 }, { "epoch": 0.6060606060606061, "eval_clip_ratio": 0.0, "eval_completion_length": 311.67167556762695, "eval_kl": 0.013875643741339445, "eval_loss": 0.022496523335576057, "eval_response_clip_ratio": 0.005000000149011612, "eval_reward": 0.27333333969116214, "eval_reward_std": 0.3327353143692017, "eval_rewards/MultiModalAccuracyORM": 0.27333333969116214, "eval_runtime": 606.8844, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.008, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 313.4, "epoch": 0.6080808080808081, "grad_norm": 1.6347943339220123, "kl": 0.012441246653907001, "learning_rate": 2e-07, "loss": -0.0039748698472976685, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25416667349636557, "reward_std": 0.26936939507722857, "rewards/MultiModalAccuracyORM": 0.25416667349636557, "step": 1505, "train_speed(iter/s)": 0.031443 }, { "clip_ratio": 0.0, "completion_length": 309.85, "epoch": 0.6101010101010101, "grad_norm": 0.7590195039303175, "kl": 0.012008609343320131, "learning_rate": 2e-07, "loss": 0.027225631475448608, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.36666668206453323, "reward_std": 0.32297652661800386, "rewards/MultiModalAccuracyORM": 0.36666668206453323, "step": 1510, "train_speed(iter/s)": 0.031453 }, { "clip_ratio": 0.0, "completion_length": 330.05, "epoch": 0.6121212121212121, "grad_norm": 1.4989100416458765, "kl": 0.013079424249008298, "learning_rate": 2e-07, "loss": 0.020418940484523772, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30833334028720855, "reward_std": 0.4385393440723419, "rewards/MultiModalAccuracyORM": 0.30833334028720855, "step": 1515, "train_speed(iter/s)": 0.031473 }, { "clip_ratio": 0.0, "completion_length": 506.65, "epoch": 0.6141414141414141, "grad_norm": 0.5238837736009042, "kl": 0.010060561215505004, "learning_rate": 2e-07, "loss": 0.00045015439391136167, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.10000000298023223, "reward_std": 0.19337954223155976, "rewards/MultiModalAccuracyORM": 0.10000000298023223, "step": 1520, "train_speed(iter/s)": 0.031458 }, { "clip_ratio": 0.0, "completion_length": 255.45, "epoch": 0.6161616161616161, "grad_norm": 1.393953370005201, "kl": 0.013004821306094528, "learning_rate": 2e-07, "loss": 0.032045644521713254, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.33676562607288363, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 1525, "train_speed(iter/s)": 0.03147 }, { "clip_ratio": 0.0, "completion_length": 323.35, "epoch": 0.6181818181818182, "grad_norm": 0.9011779447383061, "kl": 0.012558170035481453, "learning_rate": 2e-07, "loss": 0.03226361274719238, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3750000104308128, "reward_std": 0.31765941977500917, "rewards/MultiModalAccuracyORM": 0.3750000104308128, "step": 1530, "train_speed(iter/s)": 0.031486 }, { "clip_ratio": 0.0, "completion_length": 436.35, "epoch": 0.6202020202020202, "grad_norm": 1.1053902425907327, "kl": 0.011504510790109635, "learning_rate": 2e-07, "loss": 0.006062358617782593, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.21149236261844634, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 1535, "train_speed(iter/s)": 0.031502 }, { "clip_ratio": 0.0, "completion_length": 301.95, "epoch": 0.6222222222222222, "grad_norm": 1.5149361621486646, "kl": 0.011535796569660306, "learning_rate": 2e-07, "loss": -0.05787181854248047, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666746139526, "reward_std": 0.3985701471567154, "rewards/MultiModalAccuracyORM": 0.3416666746139526, "step": 1540, "train_speed(iter/s)": 0.031507 }, { "clip_ratio": 0.0, "completion_length": 354.75, "epoch": 0.6242424242424243, "grad_norm": 0.6317974358994652, "kl": 0.011278041498735547, "learning_rate": 2e-07, "loss": -0.008586804568767547, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000223517417, "reward_std": 0.24640740752220153, "rewards/MultiModalAccuracyORM": 0.17500000223517417, "step": 1545, "train_speed(iter/s)": 0.03152 }, { "clip_ratio": 0.0, "completion_length": 272.8, "epoch": 0.6262626262626263, "grad_norm": 1.1228824311930454, "kl": 0.012291358271613716, "learning_rate": 2e-07, "loss": 0.07681397199630738, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2416666753590107, "reward_std": 0.41868501007556913, "rewards/MultiModalAccuracyORM": 0.2416666753590107, "step": 1550, "train_speed(iter/s)": 0.031545 }, { "clip_ratio": 0.0, "completion_length": 291.1, "epoch": 0.6282828282828283, "grad_norm": 1.263695035441367, "kl": 0.016078970720991494, "learning_rate": 2e-07, "loss": 0.00948096513748169, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666738688946, "reward_std": 0.3677601933479309, "rewards/MultiModalAccuracyORM": 0.3166666738688946, "step": 1555, "train_speed(iter/s)": 0.03156 }, { "clip_ratio": 0.0, "completion_length": 308.05, "epoch": 0.6303030303030303, "grad_norm": 1.3808232047869045, "kl": 0.016249435674399137, "learning_rate": 2e-07, "loss": -0.009509658813476563, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333373069763, "reward_std": 0.31645613312721255, "rewards/MultiModalAccuracyORM": 0.2083333373069763, "step": 1560, "train_speed(iter/s)": 0.031583 }, { "clip_ratio": 0.0, "completion_length": 227.7, "epoch": 0.6323232323232323, "grad_norm": 4.203818809917105, "kl": 0.015438845753669739, "learning_rate": 2e-07, "loss": 0.056962573528289796, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667312383652, "reward_std": 0.273775514960289, "rewards/MultiModalAccuracyORM": 0.24166667312383652, "step": 1565, "train_speed(iter/s)": 0.031611 }, { "clip_ratio": 0.0, "completion_length": 425.6, "epoch": 0.6343434343434343, "grad_norm": 1.1171108239340608, "kl": 0.009527648240327835, "learning_rate": 2e-07, "loss": 0.043929648399353025, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833334028720855, "reward_std": 0.2940494179725647, "rewards/MultiModalAccuracyORM": 0.15833334028720855, "step": 1570, "train_speed(iter/s)": 0.03162 }, { "clip_ratio": 0.0, "completion_length": 260.95, "epoch": 0.6363636363636364, "grad_norm": 1.2373925332541635, "kl": 0.014681565202772617, "learning_rate": 2e-07, "loss": 0.0012456446886062623, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333395421505, "reward_std": 0.27148365676403047, "rewards/MultiModalAccuracyORM": 0.1583333395421505, "step": 1575, "train_speed(iter/s)": 0.031648 }, { "clip_ratio": 0.0, "completion_length": 233.7, "epoch": 0.6383838383838384, "grad_norm": 2.6797021271180217, "kl": 0.017097664810717107, "learning_rate": 2e-07, "loss": -0.02114928364753723, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.33300994634628295, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 1580, "train_speed(iter/s)": 0.031683 }, { "clip_ratio": 0.0, "completion_length": 291.7, "epoch": 0.6404040404040404, "grad_norm": 0.9201592598726605, "kl": 0.012181163858622312, "learning_rate": 2e-07, "loss": 0.04073759019374847, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333358168602, "reward_std": 0.2822715133428574, "rewards/MultiModalAccuracyORM": 0.1583333358168602, "step": 1585, "train_speed(iter/s)": 0.031693 }, { "clip_ratio": 0.0, "completion_length": 264.55, "epoch": 0.6424242424242425, "grad_norm": 0.8369270384527866, "kl": 0.012538785161450506, "learning_rate": 2e-07, "loss": 0.007032622396945953, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2250000022351742, "reward_std": 0.20817729830741882, "rewards/MultiModalAccuracyORM": 0.2250000022351742, "step": 1590, "train_speed(iter/s)": 0.03169 }, { "clip_ratio": 0.0, "completion_length": 227.45, "epoch": 0.6444444444444445, "grad_norm": 1.6359484517192477, "kl": 0.016188242752105, "learning_rate": 2e-07, "loss": 0.018440821766853334, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000819563865, "reward_std": 0.24637181758880616, "rewards/MultiModalAccuracyORM": 0.20000000819563865, "step": 1595, "train_speed(iter/s)": 0.031718 }, { "clip_ratio": 0.0, "completion_length": 282.65, "epoch": 0.6464646464646465, "grad_norm": 0.8089885473247238, "kl": 0.016708724852651357, "learning_rate": 2e-07, "loss": -0.05247594714164734, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.34691824316978453, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 1600, "train_speed(iter/s)": 0.03174 }, { "clip_ratio": 0.0, "completion_length": 282.2, "epoch": 0.6484848484848484, "grad_norm": 0.9909588366261658, "kl": 0.01558589404448867, "learning_rate": 2e-07, "loss": 0.019981113076210023, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000447034835, "reward_std": 0.2862814128398895, "rewards/MultiModalAccuracyORM": 0.20000000447034835, "step": 1605, "train_speed(iter/s)": 0.031763 }, { "clip_ratio": 0.0, "completion_length": 329.55, "epoch": 0.6505050505050505, "grad_norm": 0.6306196270122503, "kl": 0.012664367025718094, "learning_rate": 2e-07, "loss": -0.022629472613334655, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.08333333507180214, "reward_std": 0.20967912971973418, "rewards/MultiModalAccuracyORM": 0.08333333507180214, "step": 1610, "train_speed(iter/s)": 0.031768 }, { "clip_ratio": 0.0, "completion_length": 276.95, "epoch": 0.6525252525252525, "grad_norm": 0.7277318686643045, "kl": 0.012076504435390234, "learning_rate": 2e-07, "loss": 0.010481297969818115, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35000000670552256, "reward_std": 0.29784067571163175, "rewards/MultiModalAccuracyORM": 0.35000000670552256, "step": 1615, "train_speed(iter/s)": 0.031784 }, { "clip_ratio": 0.0, "completion_length": 273.85, "epoch": 0.6545454545454545, "grad_norm": 0.02891952012425108, "kl": 0.014772931393235923, "learning_rate": 2e-07, "loss": 0.04711937606334686, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3083333447575569, "reward_std": 0.3480859398841858, "rewards/MultiModalAccuracyORM": 0.3083333447575569, "step": 1620, "train_speed(iter/s)": 0.0318 }, { "clip_ratio": 0.0, "completion_length": 237.95, "epoch": 0.6565656565656566, "grad_norm": 0.8446774152498904, "kl": 0.029250213177874684, "learning_rate": 2e-07, "loss": -0.022804903984069824, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666738688946, "reward_std": 0.29079394936561587, "rewards/MultiModalAccuracyORM": 0.2666666738688946, "step": 1625, "train_speed(iter/s)": 0.031824 }, { "clip_ratio": 0.0, "completion_length": 341.8, "epoch": 0.6585858585858586, "grad_norm": 1.2388651468227543, "kl": 0.012895361986011267, "learning_rate": 2e-07, "loss": 0.013838109374046326, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500000521540642, "reward_std": 0.32376655340194704, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 1630, "train_speed(iter/s)": 0.031847 }, { "clip_ratio": 0.0, "completion_length": 227.05, "epoch": 0.6606060606060606, "grad_norm": 1.6308326246006755, "kl": 0.01930234730243683, "learning_rate": 2e-07, "loss": 0.030217719078063966, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333334252238274, "reward_std": 0.2985951125621796, "rewards/MultiModalAccuracyORM": 0.28333334252238274, "step": 1635, "train_speed(iter/s)": 0.031884 }, { "clip_ratio": 0.0, "completion_length": 407.65, "epoch": 0.6626262626262627, "grad_norm": 0.6347971168876615, "kl": 0.013101364299654961, "learning_rate": 2e-07, "loss": 0.04347882270812988, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.43333334252238276, "reward_std": 0.36113006472587583, "rewards/MultiModalAccuracyORM": 0.43333334252238276, "step": 1640, "train_speed(iter/s)": 0.031887 }, { "clip_ratio": 0.0, "completion_length": 280.25, "epoch": 0.6646464646464646, "grad_norm": 2.8764533574893005, "kl": 0.022802903782576323, "learning_rate": 2e-07, "loss": 0.008072008192539216, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334401249887, "reward_std": 0.38835368156433103, "rewards/MultiModalAccuracyORM": 0.33333334401249887, "step": 1645, "train_speed(iter/s)": 0.031918 }, { "clip_ratio": 0.0, "completion_length": 280.45, "epoch": 0.6666666666666666, "grad_norm": 0.027226074705726515, "kl": 0.012724117608740926, "learning_rate": 2e-07, "loss": 0.008971738815307616, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833333730697633, "reward_std": 0.2652174890041351, "rewards/MultiModalAccuracyORM": 0.25833333730697633, "step": 1650, "train_speed(iter/s)": 0.031934 }, { "clip_ratio": 0.0, "completion_length": 387.9, "epoch": 0.6686868686868687, "grad_norm": 1.0372426479480876, "kl": 0.011012718360871077, "learning_rate": 2e-07, "loss": 0.046323955059051514, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333395421505, "reward_std": 0.3229169249534607, "rewards/MultiModalAccuracyORM": 0.1833333395421505, "step": 1655, "train_speed(iter/s)": 0.03195 }, { "clip_ratio": 0.0, "completion_length": 315.55, "epoch": 0.6707070707070707, "grad_norm": 0.7414666681358184, "kl": 0.01629993673413992, "learning_rate": 2e-07, "loss": 0.037484565377235414, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333507180215, "reward_std": 0.2813224524259567, "rewards/MultiModalAccuracyORM": 0.23333333507180215, "step": 1660, "train_speed(iter/s)": 0.031968 }, { "clip_ratio": 0.0, "completion_length": 238.0, "epoch": 0.6727272727272727, "grad_norm": 0.7494737035564301, "kl": 0.014933030121028423, "learning_rate": 2e-07, "loss": -0.010917484760284424, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.2526139736175537, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 1665, "train_speed(iter/s)": 0.03199 }, { "clip_ratio": 0.0, "completion_length": 221.15, "epoch": 0.6747474747474748, "grad_norm": 0.8825460973521991, "kl": 0.020983812306076287, "learning_rate": 2e-07, "loss": 0.029893827438354493, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833334028720856, "reward_std": 0.36012140214443206, "rewards/MultiModalAccuracyORM": 0.25833334028720856, "step": 1670, "train_speed(iter/s)": 0.032 }, { "clip_ratio": 0.0, "completion_length": 245.4, "epoch": 0.6767676767676768, "grad_norm": 0.850414025178336, "kl": 0.013601220259442926, "learning_rate": 2e-07, "loss": 0.06674546003341675, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333410322666, "reward_std": 0.2629852324724197, "rewards/MultiModalAccuracyORM": 0.2083333410322666, "step": 1675, "train_speed(iter/s)": 0.032024 }, { "clip_ratio": 0.0, "completion_length": 251.45, "epoch": 0.6787878787878788, "grad_norm": 1.1643033783932533, "kl": 0.01739194723777473, "learning_rate": 2e-07, "loss": -0.02118738889694214, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2250000037252903, "reward_std": 0.35642533004283905, "rewards/MultiModalAccuracyORM": 0.2250000037252903, "step": 1680, "train_speed(iter/s)": 0.032041 }, { "clip_ratio": 0.0, "completion_length": 176.45, "epoch": 0.6808080808080809, "grad_norm": 1.1839236469929488, "kl": 0.015266428608447314, "learning_rate": 2e-07, "loss": 0.0050781965255737305, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.350000012665987, "reward_std": 0.28154108822345736, "rewards/MultiModalAccuracyORM": 0.350000012665987, "step": 1685, "train_speed(iter/s)": 0.032082 }, { "clip_ratio": 0.0, "completion_length": 217.2, "epoch": 0.6828282828282828, "grad_norm": 0.02924478206081596, "kl": 0.026798779796808957, "learning_rate": 2e-07, "loss": 0.01288943737745285, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833334177732468, "reward_std": 0.2581707626581192, "rewards/MultiModalAccuracyORM": 0.20833334177732468, "step": 1690, "train_speed(iter/s)": 0.032119 }, { "clip_ratio": 0.0, "completion_length": 255.9, "epoch": 0.6848484848484848, "grad_norm": 0.4842656141561725, "kl": 0.07169700982049107, "learning_rate": 2e-07, "loss": 0.0162178635597229, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666798293591, "reward_std": 0.3265135705471039, "rewards/MultiModalAccuracyORM": 0.3666666798293591, "step": 1695, "train_speed(iter/s)": 0.032135 }, { "clip_ratio": 0.0, "completion_length": 255.1, "epoch": 0.6868686868686869, "grad_norm": 1.6517445880602681, "kl": 0.014678607648238539, "learning_rate": 2e-07, "loss": -0.0174052894115448, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.33326417207717896, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 1700, "train_speed(iter/s)": 0.032154 }, { "clip_ratio": 0.0, "completion_length": 242.55, "epoch": 0.6888888888888889, "grad_norm": 1.3895903493056987, "kl": 0.023576964903622866, "learning_rate": 2e-07, "loss": -0.044988250732421874, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333333879709244, "reward_std": 0.28252573907375333, "rewards/MultiModalAccuracyORM": 0.28333333879709244, "step": 1705, "train_speed(iter/s)": 0.032181 }, { "clip_ratio": 0.0, "completion_length": 309.2, "epoch": 0.6909090909090909, "grad_norm": 1.0976488092373375, "kl": 0.013535353261977435, "learning_rate": 2e-07, "loss": 0.02341702878475189, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.32902405858039857, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 1710, "train_speed(iter/s)": 0.032206 }, { "clip_ratio": 0.0, "completion_length": 281.5, "epoch": 0.692929292929293, "grad_norm": 0.8889266103149475, "kl": 0.019961224216967822, "learning_rate": 2e-07, "loss": 0.019682276248931884, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35833333879709245, "reward_std": 0.27934442162513734, "rewards/MultiModalAccuracyORM": 0.35833333879709245, "step": 1715, "train_speed(iter/s)": 0.032218 }, { "clip_ratio": 0.0, "completion_length": 247.75, "epoch": 0.694949494949495, "grad_norm": 0.8073289767388547, "kl": 0.015237010596320034, "learning_rate": 2e-07, "loss": 0.017612373828887938, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.38450281620025634, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 1720, "train_speed(iter/s)": 0.032235 }, { "clip_ratio": 0.0, "completion_length": 278.85, "epoch": 0.696969696969697, "grad_norm": 1.204716243957595, "kl": 0.015058515965938568, "learning_rate": 2e-07, "loss": 0.02674557566642761, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000089406967, "reward_std": 0.3480859398841858, "rewards/MultiModalAccuracyORM": 0.2750000089406967, "step": 1725, "train_speed(iter/s)": 0.032253 }, { "clip_ratio": 0.0, "completion_length": 304.3, "epoch": 0.6989898989898989, "grad_norm": 0.8965654466446099, "kl": 0.011863613128662109, "learning_rate": 2e-07, "loss": 0.055030471086502074, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.35343863666057584, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 1730, "train_speed(iter/s)": 0.032269 }, { "clip_ratio": 0.0, "completion_length": 315.8, "epoch": 0.701010101010101, "grad_norm": 0.5431864929728417, "kl": 0.015647308621555566, "learning_rate": 2e-07, "loss": -0.039025521278381346, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333358168602, "reward_std": 0.23004821836948394, "rewards/MultiModalAccuracyORM": 0.1583333358168602, "step": 1735, "train_speed(iter/s)": 0.032274 }, { "clip_ratio": 0.0, "completion_length": 218.75, "epoch": 0.703030303030303, "grad_norm": 1.7430511187722826, "kl": 0.019717163406312466, "learning_rate": 2e-07, "loss": -0.02261778712272644, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.3041424334049225, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 1740, "train_speed(iter/s)": 0.032306 }, { "clip_ratio": 0.0, "completion_length": 243.05, "epoch": 0.705050505050505, "grad_norm": 0.036996154308780546, "kl": 0.02013384862802923, "learning_rate": 2e-07, "loss": 0.032172924280166625, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30000000074505806, "reward_std": 0.2362758308649063, "rewards/MultiModalAccuracyORM": 0.30000000074505806, "step": 1745, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.7070707070707071, "grad_norm": 1.092887250386922, "learning_rate": 2e-07, "loss": -0.023990578949451447, "memory(GiB)": 67.41, "step": 1750, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.7070707070707071, "eval_clip_ratio": 0.0, "eval_completion_length": 294.09001091003415, "eval_kl": 0.023105102032423018, "eval_loss": 0.036663174629211426, "eval_response_clip_ratio": 0.006666666865348816, "eval_reward": 0.2633333384990692, "eval_reward_std": 0.31118109107017516, "eval_rewards/MultiModalAccuracyORM": 0.2633333384990692, "eval_runtime": 620.9868, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.008, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 287.0, "epoch": 0.7090909090909091, "grad_norm": 1.0538874052505927, "kl": 0.016379984514787792, "learning_rate": 2e-07, "loss": -0.005869853496551514, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000596046447, "reward_std": 0.3115545302629471, "rewards/MultiModalAccuracyORM": 0.20000000596046447, "step": 1755, "train_speed(iter/s)": 0.031772 }, { "clip_ratio": 0.0, "completion_length": 266.25, "epoch": 0.7111111111111111, "grad_norm": 1.3903102724055967, "kl": 0.013312125299125909, "learning_rate": 2e-07, "loss": 0.02017918825149536, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666666865348815, "reward_std": 0.29776653051376345, "rewards/MultiModalAccuracyORM": 0.26666666865348815, "step": 1760, "train_speed(iter/s)": 0.031786 }, { "clip_ratio": 0.0, "completion_length": 331.8, "epoch": 0.7131313131313132, "grad_norm": 1.3836373367153505, "kl": 0.0138115500099957, "learning_rate": 2e-07, "loss": 0.03059466779232025, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1333333358168602, "reward_std": 0.2370448112487793, "rewards/MultiModalAccuracyORM": 0.1333333358168602, "step": 1765, "train_speed(iter/s)": 0.031799 }, { "clip_ratio": 0.0, "completion_length": 271.15, "epoch": 0.7151515151515152, "grad_norm": 1.4216966273148588, "kl": 0.01580625809729099, "learning_rate": 2e-07, "loss": -0.019823677837848663, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000447034837, "reward_std": 0.23631439208984376, "rewards/MultiModalAccuracyORM": 0.12500000447034837, "step": 1770, "train_speed(iter/s)": 0.031824 }, { "clip_ratio": 0.0, "completion_length": 186.55, "epoch": 0.7171717171717171, "grad_norm": 1.1462808808604095, "kl": 0.05410000858828425, "learning_rate": 2e-07, "loss": 0.040007442235946655, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2916666783392429, "reward_std": 0.3511467784643173, "rewards/MultiModalAccuracyORM": 0.2916666783392429, "step": 1775, "train_speed(iter/s)": 0.03186 }, { "clip_ratio": 0.0, "completion_length": 411.8, "epoch": 0.7191919191919192, "grad_norm": 1.1009164808291978, "kl": 0.012898495933040977, "learning_rate": 2e-07, "loss": 0.03641944527626038, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.30410684943199157, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 1780, "train_speed(iter/s)": 0.031851 }, { "clip_ratio": 0.0, "completion_length": 352.9, "epoch": 0.7212121212121212, "grad_norm": 2.3163685194934924, "kl": 0.016784476628527046, "learning_rate": 2e-07, "loss": 0.03075094223022461, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.1833333380520344, "reward_std": 0.30815233290195465, "rewards/MultiModalAccuracyORM": 0.1833333380520344, "step": 1785, "train_speed(iter/s)": 0.031872 }, { "clip_ratio": 0.0, "completion_length": 283.8, "epoch": 0.7232323232323232, "grad_norm": 1.7199397374779446, "kl": 0.01555022168904543, "learning_rate": 2e-07, "loss": -0.009939193725585938, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.29634127020835876, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 1790, "train_speed(iter/s)": 0.031884 }, { "clip_ratio": 0.0, "completion_length": 304.85, "epoch": 0.7252525252525253, "grad_norm": 3.3750611418125196, "kl": 0.017106020543724298, "learning_rate": 2e-07, "loss": 0.01690548360347748, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.31666667610406873, "reward_std": 0.33083729147911073, "rewards/MultiModalAccuracyORM": 0.31666667610406873, "step": 1795, "train_speed(iter/s)": 0.031895 }, { "clip_ratio": 0.0, "completion_length": 303.0, "epoch": 0.7272727272727273, "grad_norm": 0.9074495991885635, "kl": 0.02303459094837308, "learning_rate": 2e-07, "loss": 0.013132384419441223, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3750000111758709, "reward_std": 0.35964520275592804, "rewards/MultiModalAccuracyORM": 0.3750000111758709, "step": 1800, "train_speed(iter/s)": 0.031921 }, { "clip_ratio": 0.0, "completion_length": 276.65, "epoch": 0.7292929292929293, "grad_norm": 1.5259661253214234, "kl": 0.01673535956069827, "learning_rate": 2e-07, "loss": -0.02494005113840103, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000037252903, "reward_std": 0.19487895369529723, "rewards/MultiModalAccuracyORM": 0.3250000037252903, "step": 1805, "train_speed(iter/s)": 0.03194 }, { "clip_ratio": 0.0, "completion_length": 237.1, "epoch": 0.7313131313131314, "grad_norm": 23.765604936964085, "kl": 0.0391254379414022, "learning_rate": 2e-07, "loss": 0.07751191854476928, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333410322666, "reward_std": 0.34710128903388976, "rewards/MultiModalAccuracyORM": 0.2083333410322666, "step": 1810, "train_speed(iter/s)": 0.031961 }, { "clip_ratio": 0.0, "completion_length": 226.05, "epoch": 0.7333333333333333, "grad_norm": 1.844663010011029, "kl": 0.018137864442542194, "learning_rate": 2e-07, "loss": 0.006373977661132813, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4500000163912773, "reward_std": 0.4026396483182907, "rewards/MultiModalAccuracyORM": 0.4500000163912773, "step": 1815, "train_speed(iter/s)": 0.031995 }, { "clip_ratio": 0.0, "completion_length": 259.55, "epoch": 0.7353535353535353, "grad_norm": 3.1736096732843753, "kl": 0.01654947120696306, "learning_rate": 2e-07, "loss": -0.03625679612159729, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35000000819563865, "reward_std": 0.33621527552604674, "rewards/MultiModalAccuracyORM": 0.35000000819563865, "step": 1820, "train_speed(iter/s)": 0.032006 }, { "clip_ratio": 0.0, "completion_length": 273.95, "epoch": 0.7373737373737373, "grad_norm": 0.8446499426276284, "kl": 0.01737216175533831, "learning_rate": 2e-07, "loss": 0.01487790048122406, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333334028720856, "reward_std": 0.18083562850952148, "rewards/MultiModalAccuracyORM": 0.13333334028720856, "step": 1825, "train_speed(iter/s)": 0.032029 }, { "clip_ratio": 0.0, "completion_length": 333.2, "epoch": 0.7393939393939394, "grad_norm": 12.81365368807463, "kl": 0.017870889231562614, "learning_rate": 2e-07, "loss": -0.04711695909500122, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333507180214, "reward_std": 0.20967913568019866, "rewards/MultiModalAccuracyORM": 0.13333333507180214, "step": 1830, "train_speed(iter/s)": 0.032037 }, { "clip_ratio": 0.0, "completion_length": 290.2, "epoch": 0.7414141414141414, "grad_norm": 0.8602503566731583, "kl": 0.06428629895672203, "learning_rate": 2e-07, "loss": 0.0755260705947876, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666738688946, "reward_std": 0.3033378630876541, "rewards/MultiModalAccuracyORM": 0.2666666738688946, "step": 1835, "train_speed(iter/s)": 0.032056 }, { "clip_ratio": 0.0, "completion_length": 217.2, "epoch": 0.7434343434343434, "grad_norm": 4.0474296652864865, "kl": 0.025846300972625615, "learning_rate": 2e-07, "loss": 0.03730224370956421, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4250000067055225, "reward_std": 0.4119280993938446, "rewards/MultiModalAccuracyORM": 0.4250000067055225, "step": 1840, "train_speed(iter/s)": 0.032071 }, { "clip_ratio": 0.0, "completion_length": 301.95, "epoch": 0.7454545454545455, "grad_norm": 2.232537088059296, "kl": 0.01461884556338191, "learning_rate": 2e-07, "loss": -0.004309023916721344, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666667237877845, "reward_std": 0.313649520277977, "rewards/MultiModalAccuracyORM": 0.26666667237877845, "step": 1845, "train_speed(iter/s)": 0.032079 }, { "clip_ratio": 0.0, "completion_length": 260.05, "epoch": 0.7474747474747475, "grad_norm": 2.3016578929204963, "kl": 0.021660260390490294, "learning_rate": 2e-07, "loss": -0.03592326939105987, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3083333395421505, "reward_std": 0.26591232419013977, "rewards/MultiModalAccuracyORM": 0.3083333395421505, "step": 1850, "train_speed(iter/s)": 0.032095 }, { "clip_ratio": 0.0, "completion_length": 240.8, "epoch": 0.7494949494949495, "grad_norm": 0.040878648566390896, "kl": 0.016377491503953935, "learning_rate": 2e-07, "loss": -0.007729291915893555, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2916666746139526, "reward_std": 0.2752989321947098, "rewards/MultiModalAccuracyORM": 0.2916666746139526, "step": 1855, "train_speed(iter/s)": 0.03211 }, { "clip_ratio": 0.0, "completion_length": 277.6, "epoch": 0.7515151515151515, "grad_norm": 1.745735435957987, "kl": 0.01565658366307616, "learning_rate": 2e-07, "loss": 0.03239756226539612, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.34933354556560514, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 1860, "train_speed(iter/s)": 0.03212 }, { "clip_ratio": 0.0, "completion_length": 463.15, "epoch": 0.7535353535353535, "grad_norm": 1.1867439188156674, "kl": 0.013316378556191921, "learning_rate": 2e-07, "loss": 0.02554565668106079, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.19166667386889458, "reward_std": 0.3019101768732071, "rewards/MultiModalAccuracyORM": 0.19166667386889458, "step": 1865, "train_speed(iter/s)": 0.032117 }, { "clip_ratio": 0.0, "completion_length": 241.8, "epoch": 0.7555555555555555, "grad_norm": 1.6710102974959882, "kl": 0.0185435910243541, "learning_rate": 2e-07, "loss": 0.0040223002433776855, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166666865348816, "reward_std": 0.32418315708637235, "rewards/MultiModalAccuracyORM": 0.19166666865348816, "step": 1870, "train_speed(iter/s)": 0.032131 }, { "clip_ratio": 0.0, "completion_length": 219.95, "epoch": 0.7575757575757576, "grad_norm": 1.6067423711502076, "kl": 0.019126034528017043, "learning_rate": 2e-07, "loss": 0.044132769107818604, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.22500001192092894, "reward_std": 0.2815650999546051, "rewards/MultiModalAccuracyORM": 0.22500001192092894, "step": 1875, "train_speed(iter/s)": 0.032154 }, { "clip_ratio": 0.0, "completion_length": 257.55, "epoch": 0.7595959595959596, "grad_norm": 0.028316390333514352, "kl": 0.013678487855941057, "learning_rate": 2e-07, "loss": 0.011262473464012147, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.0916666679084301, "reward_std": 0.16855751872062683, "rewards/MultiModalAccuracyORM": 0.0916666679084301, "step": 1880, "train_speed(iter/s)": 0.032167 }, { "clip_ratio": 0.0, "completion_length": 411.7, "epoch": 0.7616161616161616, "grad_norm": 1.2321236683558325, "kl": 0.016270547499880196, "learning_rate": 2e-07, "loss": -0.05071290135383606, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000223517417, "reward_std": 0.3174407839775085, "rewards/MultiModalAccuracyORM": 0.17500000223517417, "step": 1885, "train_speed(iter/s)": 0.032165 }, { "clip_ratio": 0.0, "completion_length": 238.25, "epoch": 0.7636363636363637, "grad_norm": 1.7168712559722545, "kl": 0.020080643892288207, "learning_rate": 2e-07, "loss": 0.014709633588790894, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166667014360426, "reward_std": 0.27371591329574585, "rewards/MultiModalAccuracyORM": 0.29166667014360426, "step": 1890, "train_speed(iter/s)": 0.032193 }, { "clip_ratio": 0.0, "completion_length": 297.8, "epoch": 0.7656565656565657, "grad_norm": 1.4396153411408321, "kl": 0.015443798806518316, "learning_rate": 2e-07, "loss": -0.03242262601852417, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833333879709243, "reward_std": 0.3297544836997986, "rewards/MultiModalAccuracyORM": 0.20833333879709243, "step": 1895, "train_speed(iter/s)": 0.032196 }, { "clip_ratio": 0.0, "completion_length": 206.0, "epoch": 0.7676767676767676, "grad_norm": 2.153471273550742, "kl": 0.019460227340459824, "learning_rate": 2e-07, "loss": 0.06560848355293274, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.23860624432563782, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 1900, "train_speed(iter/s)": 0.032197 }, { "clip_ratio": 0.0, "completion_length": 244.55, "epoch": 0.7696969696969697, "grad_norm": 1.0432229656314325, "kl": 0.017312650848180056, "learning_rate": 2e-07, "loss": 0.031227093935012818, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4583333432674408, "reward_std": 0.31520852744579314, "rewards/MultiModalAccuracyORM": 0.4583333432674408, "step": 1905, "train_speed(iter/s)": 0.032215 }, { "clip_ratio": 0.0, "completion_length": 265.75, "epoch": 0.7717171717171717, "grad_norm": 1.180310161050895, "kl": 0.015469088219106197, "learning_rate": 2e-07, "loss": -0.015459638833999634, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.36887405812740326, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 1910, "train_speed(iter/s)": 0.032224 }, { "clip_ratio": 0.0, "completion_length": 213.3, "epoch": 0.7737373737373737, "grad_norm": 0.06901782029330109, "kl": 0.020954974088817836, "learning_rate": 2e-07, "loss": 0.023259681463241578, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666761040688, "reward_std": 0.3164801448583603, "rewards/MultiModalAccuracyORM": 0.3666666761040688, "step": 1915, "train_speed(iter/s)": 0.032258 }, { "clip_ratio": 0.0, "completion_length": 294.35, "epoch": 0.7757575757575758, "grad_norm": 1.2725867918795215, "kl": 0.023106640204787254, "learning_rate": 2e-07, "loss": 0.05665600299835205, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3500000096857548, "reward_std": 0.3811877518892288, "rewards/MultiModalAccuracyORM": 0.3500000096857548, "step": 1920, "train_speed(iter/s)": 0.032278 }, { "clip_ratio": 0.0, "completion_length": 251.0, "epoch": 0.7777777777777778, "grad_norm": 1.9367541891791467, "kl": 0.05693813692778349, "learning_rate": 2e-07, "loss": 0.016718414425849915, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4583333395421505, "reward_std": 0.343126979470253, "rewards/MultiModalAccuracyORM": 0.4583333395421505, "step": 1925, "train_speed(iter/s)": 0.032298 }, { "clip_ratio": 0.0, "completion_length": 270.8, "epoch": 0.7797979797979798, "grad_norm": 1.8056180584349435, "kl": 0.018875516019761562, "learning_rate": 2e-07, "loss": 0.026608097553253173, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666746139526, "reward_std": 0.2745299518108368, "rewards/MultiModalAccuracyORM": 0.3416666746139526, "step": 1930, "train_speed(iter/s)": 0.032307 }, { "clip_ratio": 0.0, "completion_length": 275.75, "epoch": 0.7818181818181819, "grad_norm": 0.9474431031282609, "kl": 0.017304270621389152, "learning_rate": 2e-07, "loss": 0.054594576358795166, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833334028720855, "reward_std": 0.2940494120121002, "rewards/MultiModalAccuracyORM": 0.15833334028720855, "step": 1935, "train_speed(iter/s)": 0.032311 }, { "clip_ratio": 0.0, "completion_length": 413.95, "epoch": 0.7838383838383839, "grad_norm": 0.31279661376062395, "kl": 0.015814543049782515, "learning_rate": 2e-07, "loss": 0.022085633873939515, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.2250000111758709, "reward_std": 0.2674381732940674, "rewards/MultiModalAccuracyORM": 0.2250000111758709, "step": 1940, "train_speed(iter/s)": 0.032322 }, { "clip_ratio": 0.0, "completion_length": 251.15, "epoch": 0.7858585858585858, "grad_norm": 1.153254924584669, "kl": 0.022455749101936817, "learning_rate": 2e-07, "loss": -0.005599388480186462, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4583333432674408, "reward_std": 0.32981408536434176, "rewards/MultiModalAccuracyORM": 0.4583333432674408, "step": 1945, "train_speed(iter/s)": 0.032335 }, { "clip_ratio": 0.0, "completion_length": 284.8, "epoch": 0.7878787878787878, "grad_norm": 1.477534719499969, "kl": 0.015763588808476926, "learning_rate": 2e-07, "loss": 0.00195084810256958, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000089406967, "reward_std": 0.3789910912513733, "rewards/MultiModalAccuracyORM": 0.3250000089406967, "step": 1950, "train_speed(iter/s)": 0.032341 }, { "clip_ratio": 0.0, "completion_length": 184.25, "epoch": 0.7898989898989899, "grad_norm": 2.285698337644553, "kl": 0.02175712687894702, "learning_rate": 2e-07, "loss": -0.00021601170301437377, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666775941849, "reward_std": 0.3043610692024231, "rewards/MultiModalAccuracyORM": 0.3416666775941849, "step": 1955, "train_speed(iter/s)": 0.032362 }, { "clip_ratio": 0.0, "completion_length": 240.6, "epoch": 0.7919191919191919, "grad_norm": 1.2460067583966314, "kl": 0.019805201794952154, "learning_rate": 2e-07, "loss": 0.016333769261837005, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.12500000074505807, "reward_std": 0.24869927167892455, "rewards/MultiModalAccuracyORM": 0.12500000074505807, "step": 1960, "train_speed(iter/s)": 0.032387 }, { "clip_ratio": 0.0, "completion_length": 294.15, "epoch": 0.793939393939394, "grad_norm": 1.2892695908066818, "kl": 0.016942942142486574, "learning_rate": 2e-07, "loss": 0.05399552583694458, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3333333425223827, "reward_std": 0.32777645289897916, "rewards/MultiModalAccuracyORM": 0.3333333425223827, "step": 1965, "train_speed(iter/s)": 0.032394 }, { "clip_ratio": 0.0, "completion_length": 286.5, "epoch": 0.795959595959596, "grad_norm": 1.15365791846424, "kl": 0.015720244217664003, "learning_rate": 2e-07, "loss": 0.03231356143951416, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.31666667461395265, "reward_std": 0.35792473554611204, "rewards/MultiModalAccuracyORM": 0.31666667461395265, "step": 1970, "train_speed(iter/s)": 0.032404 }, { "clip_ratio": 0.0, "completion_length": 256.7, "epoch": 0.797979797979798, "grad_norm": 2.0846813962071784, "kl": 0.02078899824991822, "learning_rate": 2e-07, "loss": 0.0010771095752716065, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.36666667759418486, "reward_std": 0.34783414006233215, "rewards/MultiModalAccuracyORM": 0.36666667759418486, "step": 1975, "train_speed(iter/s)": 0.032425 }, { "clip_ratio": 0.0, "completion_length": 291.3, "epoch": 0.8, "grad_norm": 1.8113050624363713, "kl": 0.02985860425978899, "learning_rate": 2e-07, "loss": -0.02335626631975174, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.2940494120121002, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 1980, "train_speed(iter/s)": 0.032446 }, { "clip_ratio": 0.0, "completion_length": 306.5, "epoch": 0.802020202020202, "grad_norm": 0.564590741108868, "kl": 0.023125759046524762, "learning_rate": 2e-07, "loss": -0.003053317964076996, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000074505806, "reward_std": 0.2599243938922882, "rewards/MultiModalAccuracyORM": 0.3250000074505806, "step": 1985, "train_speed(iter/s)": 0.032463 }, { "clip_ratio": 0.0, "completion_length": 255.45, "epoch": 0.804040404040404, "grad_norm": 1.8493252896290147, "kl": 0.026984267216175795, "learning_rate": 2e-07, "loss": -0.06052778363227844, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.3063987076282501, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 1990, "train_speed(iter/s)": 0.032484 }, { "clip_ratio": 0.0, "completion_length": 238.1, "epoch": 0.806060606060606, "grad_norm": 1.0923781003773394, "kl": 0.01942981770262122, "learning_rate": 2e-07, "loss": 0.013067781925201416, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166667014360428, "reward_std": 0.37893148958683015, "rewards/MultiModalAccuracyORM": 0.19166667014360428, "step": 1995, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.8080808080808081, "grad_norm": 1.528791414674691, "learning_rate": 2e-07, "loss": -0.023299628496170045, "memory(GiB)": 67.41, "step": 2000, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.8080808080808081, "eval_clip_ratio": 0.0, "eval_completion_length": 260.0916757965088, "eval_kl": 0.03659271189942956, "eval_loss": 0.022981010377407074, "eval_response_clip_ratio": 0.001666666716337204, "eval_reward": 0.3116666758060455, "eval_reward_std": 0.3461023557186127, "eval_rewards/MultiModalAccuracyORM": 0.3116666758060455, "eval_runtime": 577.7727, "eval_samples_per_second": 0.087, "eval_steps_per_second": 0.009, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 256.1, "epoch": 0.8101010101010101, "grad_norm": 0.6858703005476166, "kl": 0.021803660970181226, "learning_rate": 2e-07, "loss": -0.01412125825881958, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17916667126119137, "reward_std": 0.27865810990333556, "rewards/MultiModalAccuracyORM": 0.17916667126119137, "step": 2005, "train_speed(iter/s)": 0.032026 }, { "clip_ratio": 0.0, "completion_length": 235.3, "epoch": 0.8121212121212121, "grad_norm": 1.240645294227265, "kl": 0.021775086782872675, "learning_rate": 2e-07, "loss": -0.009345543384552003, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666738688946, "reward_std": 0.25591449439525604, "rewards/MultiModalAccuracyORM": 0.2666666738688946, "step": 2010, "train_speed(iter/s)": 0.032055 }, { "clip_ratio": 0.0, "completion_length": 274.75, "epoch": 0.8141414141414142, "grad_norm": 0.8866953662012617, "kl": 0.025209260638803244, "learning_rate": 2e-07, "loss": -0.007123380899429321, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334103226664, "reward_std": 0.38077115416526797, "rewards/MultiModalAccuracyORM": 0.33333334103226664, "step": 2015, "train_speed(iter/s)": 0.032076 }, { "clip_ratio": 0.0, "completion_length": 224.65, "epoch": 0.8161616161616162, "grad_norm": 2.421458976490517, "kl": 0.019219990959390996, "learning_rate": 2e-07, "loss": -0.046216756105422974, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25833334028720856, "reward_std": 0.3353258162736893, "rewards/MultiModalAccuracyORM": 0.25833334028720856, "step": 2020, "train_speed(iter/s)": 0.032083 }, { "clip_ratio": 0.0, "completion_length": 354.7, "epoch": 0.8181818181818182, "grad_norm": 1.323113651332219, "kl": 0.0228486392647028, "learning_rate": 2e-07, "loss": 0.018266260623931885, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4083333402872086, "reward_std": 0.22224705517292023, "rewards/MultiModalAccuracyORM": 0.4083333402872086, "step": 2025, "train_speed(iter/s)": 0.032101 }, { "clip_ratio": 0.0, "completion_length": 277.3, "epoch": 0.8202020202020202, "grad_norm": 1.351077195712627, "kl": 0.020902801770716906, "learning_rate": 2e-07, "loss": 0.061953216791152954, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3666666768491268, "reward_std": 0.39707074165344236, "rewards/MultiModalAccuracyORM": 0.3666666768491268, "step": 2030, "train_speed(iter/s)": 0.032117 }, { "clip_ratio": 0.0, "completion_length": 417.7, "epoch": 0.8222222222222222, "grad_norm": 1.2938426116281756, "kl": 0.016236740909516812, "learning_rate": 2e-07, "loss": 0.06566336154937744, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25000000596046446, "reward_std": 0.32052563428878783, "rewards/MultiModalAccuracyORM": 0.25000000596046446, "step": 2035, "train_speed(iter/s)": 0.032133 }, { "clip_ratio": 0.0, "completion_length": 379.7, "epoch": 0.8242424242424242, "grad_norm": 0.037993510667949114, "kl": 0.018295575771480797, "learning_rate": 2e-07, "loss": -0.04348133802413941, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.16666667014360428, "reward_std": 0.3227578908205032, "rewards/MultiModalAccuracyORM": 0.16666667014360428, "step": 2040, "train_speed(iter/s)": 0.032143 }, { "clip_ratio": 0.0, "completion_length": 188.4, "epoch": 0.8262626262626263, "grad_norm": 1.7568452367664746, "kl": 0.022255995497107505, "learning_rate": 2e-07, "loss": -0.010665307939052581, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.32500000670552254, "reward_std": 0.36062985360622407, "rewards/MultiModalAccuracyORM": 0.32500000670552254, "step": 2045, "train_speed(iter/s)": 0.032157 }, { "clip_ratio": 0.0, "completion_length": 324.9, "epoch": 0.8282828282828283, "grad_norm": 0.3584645693525935, "kl": 0.026058319211006164, "learning_rate": 2e-07, "loss": 0.008829855918884277, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333805203437, "reward_std": 0.26518189907073975, "rewards/MultiModalAccuracyORM": 0.13333333805203437, "step": 2050, "train_speed(iter/s)": 0.032167 }, { "clip_ratio": 0.0, "completion_length": 268.7, "epoch": 0.8303030303030303, "grad_norm": 1.0414808041792807, "kl": 0.024912268854677677, "learning_rate": 2e-07, "loss": -0.0169498473405838, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666731238365, "reward_std": 0.3943714380264282, "rewards/MultiModalAccuracyORM": 0.2666666731238365, "step": 2055, "train_speed(iter/s)": 0.032187 }, { "clip_ratio": 0.0, "completion_length": 387.25, "epoch": 0.8323232323232324, "grad_norm": 0.8330776026520852, "kl": 0.017706521693617104, "learning_rate": 2e-07, "loss": -0.0015785574913024902, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.33333334475755694, "reward_std": 0.3066769391298294, "rewards/MultiModalAccuracyORM": 0.33333334475755694, "step": 2060, "train_speed(iter/s)": 0.032184 }, { "clip_ratio": 0.0, "completion_length": 361.8, "epoch": 0.8343434343434344, "grad_norm": 1.286067527047748, "kl": 0.016120643261820077, "learning_rate": 2e-07, "loss": 0.010149773955345155, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1583333395421505, "reward_std": 0.30636311173439024, "rewards/MultiModalAccuracyORM": 0.1583333395421505, "step": 2065, "train_speed(iter/s)": 0.032193 }, { "clip_ratio": 0.0, "completion_length": 341.9, "epoch": 0.8363636363636363, "grad_norm": 0.8716186001419639, "kl": 0.025367347244173288, "learning_rate": 2e-07, "loss": -0.00466080904006958, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35000001192092894, "reward_std": 0.29784067571163175, "rewards/MultiModalAccuracyORM": 0.35000001192092894, "step": 2070, "train_speed(iter/s)": 0.032191 }, { "clip_ratio": 0.0, "completion_length": 228.15, "epoch": 0.8383838383838383, "grad_norm": 2.4195845756657106, "kl": 0.027777089178562163, "learning_rate": 2e-07, "loss": 0.0020249992609024047, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30833333879709246, "reward_std": 0.2885376811027527, "rewards/MultiModalAccuracyORM": 0.30833333879709246, "step": 2075, "train_speed(iter/s)": 0.032214 }, { "clip_ratio": 0.0, "completion_length": 252.1, "epoch": 0.8404040404040404, "grad_norm": 1.3049618894030401, "kl": 0.055896259006112815, "learning_rate": 2e-07, "loss": 0.02880297303199768, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000037252903, "reward_std": 0.2853323519229889, "rewards/MultiModalAccuracyORM": 0.3250000037252903, "step": 2080, "train_speed(iter/s)": 0.032232 }, { "clip_ratio": 0.0, "completion_length": 407.05, "epoch": 0.8424242424242424, "grad_norm": 0.8244280245530798, "kl": 0.02422601543366909, "learning_rate": 2e-07, "loss": -0.006161260604858399, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.29560841917991637, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 2085, "train_speed(iter/s)": 0.032237 }, { "clip_ratio": 0.0, "completion_length": 281.9, "epoch": 0.8444444444444444, "grad_norm": 1.3048496117538315, "kl": 0.01891004741191864, "learning_rate": 2e-07, "loss": -0.049727249145507815, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.14166666865348815, "reward_std": 0.3290000528097153, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 2090, "train_speed(iter/s)": 0.032254 }, { "clip_ratio": 0.0, "completion_length": 266.05, "epoch": 0.8464646464646465, "grad_norm": 1.1403945225556158, "kl": 0.01645102323964238, "learning_rate": 2e-07, "loss": -0.013469058275222778, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666716337204, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.3166666716337204, "step": 2095, "train_speed(iter/s)": 0.032269 }, { "clip_ratio": 0.0, "completion_length": 410.45, "epoch": 0.8484848484848485, "grad_norm": 0.9707324672728811, "kl": 0.029299197159707545, "learning_rate": 2e-07, "loss": 0.03684330582618713, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.37500000521540644, "reward_std": 0.2792848199605942, "rewards/MultiModalAccuracyORM": 0.37500000521540644, "step": 2100, "train_speed(iter/s)": 0.032273 }, { "clip_ratio": 0.0, "completion_length": 291.2, "epoch": 0.8505050505050505, "grad_norm": 1.218289932401983, "kl": 0.02226941576227546, "learning_rate": 2e-07, "loss": 0.049808406829833986, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000521540642, "reward_std": 0.3563301384449005, "rewards/MultiModalAccuracyORM": 0.20000000521540642, "step": 2105, "train_speed(iter/s)": 0.032281 }, { "clip_ratio": 0.0, "completion_length": 242.25, "epoch": 0.8525252525252526, "grad_norm": 0.5870668188140559, "kl": 0.07989006163552403, "learning_rate": 2e-07, "loss": 0.03518458604812622, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.5250000037252903, "reward_std": 0.21374863088130952, "rewards/MultiModalAccuracyORM": 0.5250000037252903, "step": 2110, "train_speed(iter/s)": 0.032307 }, { "clip_ratio": 0.0, "completion_length": 293.3, "epoch": 0.8545454545454545, "grad_norm": 0.020336894429096215, "kl": 0.037888656742870806, "learning_rate": 2e-07, "loss": -0.014006611704826356, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.2792848199605942, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 2115, "train_speed(iter/s)": 0.032314 }, { "clip_ratio": 0.0, "completion_length": 372.15, "epoch": 0.8565656565656565, "grad_norm": 1.21835723660335, "kl": 0.017908666748553514, "learning_rate": 2e-07, "loss": -0.01713634133338928, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.28333333879709244, "reward_std": 0.40485736131668093, "rewards/MultiModalAccuracyORM": 0.28333333879709244, "step": 2120, "train_speed(iter/s)": 0.032326 }, { "clip_ratio": 0.0, "completion_length": 331.75, "epoch": 0.8585858585858586, "grad_norm": 2.0067295271647985, "kl": 0.026275785733014347, "learning_rate": 2e-07, "loss": 0.05382862687110901, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667684912682, "reward_std": 0.37440980076789854, "rewards/MultiModalAccuracyORM": 0.24166667684912682, "step": 2125, "train_speed(iter/s)": 0.032342 }, { "clip_ratio": 0.0, "completion_length": 242.4, "epoch": 0.8606060606060606, "grad_norm": 1.6449849094427824, "kl": 0.026703681144863368, "learning_rate": 2e-07, "loss": 0.07567849159240722, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666775941849, "reward_std": 0.383000984787941, "rewards/MultiModalAccuracyORM": 0.3166666775941849, "step": 2130, "train_speed(iter/s)": 0.032348 }, { "clip_ratio": 0.0, "completion_length": 273.25, "epoch": 0.8626262626262626, "grad_norm": 1.015338618446623, "kl": 0.02266251090914011, "learning_rate": 2e-07, "loss": 0.0509304940700531, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1666666679084301, "reward_std": 0.29462080299854276, "rewards/MultiModalAccuracyORM": 0.1666666679084301, "step": 2135, "train_speed(iter/s)": 0.032357 }, { "clip_ratio": 0.0, "completion_length": 221.85, "epoch": 0.8646464646464647, "grad_norm": 1.431447102184734, "kl": 0.05701554603874683, "learning_rate": 2e-07, "loss": -0.008281412720680236, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3166666693985462, "reward_std": 0.23030244410037995, "rewards/MultiModalAccuracyORM": 0.3166666693985462, "step": 2140, "train_speed(iter/s)": 0.032369 }, { "clip_ratio": 0.0, "completion_length": 233.3, "epoch": 0.8666666666666667, "grad_norm": 1.6811314333402767, "kl": 0.06945961127057672, "learning_rate": 2e-07, "loss": 0.04042296409606934, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3333333387970924, "reward_std": 0.28154108822345736, "rewards/MultiModalAccuracyORM": 0.3333333387970924, "step": 2145, "train_speed(iter/s)": 0.032395 }, { "clip_ratio": 0.0, "completion_length": 244.2, "epoch": 0.8686868686868687, "grad_norm": 1.7106832961427025, "kl": 0.02610405897721648, "learning_rate": 2e-07, "loss": 0.018969109654426573, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333395421505, "reward_std": 0.316710364818573, "rewards/MultiModalAccuracyORM": 0.2333333395421505, "step": 2150, "train_speed(iter/s)": 0.032414 }, { "clip_ratio": 0.0, "completion_length": 241.85, "epoch": 0.8707070707070707, "grad_norm": 2.2093533026187493, "kl": 0.027732077380642296, "learning_rate": 2e-07, "loss": -0.04979143142700195, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.2782616138458252, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 2155, "train_speed(iter/s)": 0.032415 }, { "clip_ratio": 0.0, "completion_length": 361.4, "epoch": 0.8727272727272727, "grad_norm": 1.4196000317723967, "kl": 0.017831438966095448, "learning_rate": 2e-07, "loss": -0.016922876238822937, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1000000037252903, "reward_std": 0.18733201026916504, "rewards/MultiModalAccuracyORM": 0.1000000037252903, "step": 2160, "train_speed(iter/s)": 0.032428 }, { "clip_ratio": 0.0, "completion_length": 310.35, "epoch": 0.8747474747474747, "grad_norm": 1.2738059173790044, "kl": 0.027828316576778887, "learning_rate": 2e-07, "loss": 0.0020169079303741454, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.43333334401249884, "reward_std": 0.3993601739406586, "rewards/MultiModalAccuracyORM": 0.43333334401249884, "step": 2165, "train_speed(iter/s)": 0.032444 }, { "clip_ratio": 0.0, "completion_length": 315.65, "epoch": 0.8767676767676768, "grad_norm": 0.5711871148221249, "kl": 0.021445599384605885, "learning_rate": 2e-07, "loss": -0.021869242191314697, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2083333373069763, "reward_std": 0.2652174890041351, "rewards/MultiModalAccuracyORM": 0.2083333373069763, "step": 2170, "train_speed(iter/s)": 0.032448 }, { "clip_ratio": 0.0, "completion_length": 323.3, "epoch": 0.8787878787878788, "grad_norm": 0.7831111399049216, "kl": 0.02290212018415332, "learning_rate": 2e-07, "loss": 0.0058018617331981655, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.2191862165927887, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 2175, "train_speed(iter/s)": 0.032457 }, { "clip_ratio": 0.0, "completion_length": 352.4, "epoch": 0.8808080808080808, "grad_norm": 0.8065748585811442, "kl": 0.020074152015149595, "learning_rate": 2e-07, "loss": -0.01390417218208313, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2333333410322666, "reward_std": 0.355694904923439, "rewards/MultiModalAccuracyORM": 0.2333333410322666, "step": 2180, "train_speed(iter/s)": 0.032459 }, { "clip_ratio": 0.0, "completion_length": 238.9, "epoch": 0.8828282828282829, "grad_norm": 1.4489205967938936, "kl": 0.023838929925113918, "learning_rate": 2e-07, "loss": -0.004918675124645233, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2666666775941849, "reward_std": 0.350342208147049, "rewards/MultiModalAccuracyORM": 0.2666666775941849, "step": 2185, "train_speed(iter/s)": 0.032473 }, { "clip_ratio": 0.0, "completion_length": 268.05, "epoch": 0.8848484848484849, "grad_norm": 0.02924850303466511, "kl": 0.022593512199819088, "learning_rate": 2e-07, "loss": 0.023534037172794342, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3250000037252903, "reward_std": 0.2567190647125244, "rewards/MultiModalAccuracyORM": 0.3250000037252903, "step": 2190, "train_speed(iter/s)": 0.032483 }, { "clip_ratio": 0.0, "completion_length": 301.65, "epoch": 0.8868686868686869, "grad_norm": 0.03836674408051373, "kl": 0.02357559949159622, "learning_rate": 2e-07, "loss": -0.039435860514640805, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.14166667088866233, "reward_std": 0.20519061088562013, "rewards/MultiModalAccuracyORM": 0.14166667088866233, "step": 2195, "train_speed(iter/s)": 0.032503 }, { "clip_ratio": 0.0, "completion_length": 275.5, "epoch": 0.8888888888888888, "grad_norm": 0.38821860650162454, "kl": 0.018242907989770175, "learning_rate": 2e-07, "loss": 0.012925875186920167, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333805203437, "reward_std": 0.23481498062610626, "rewards/MultiModalAccuracyORM": 0.13333333805203437, "step": 2200, "train_speed(iter/s)": 0.032493 }, { "clip_ratio": 0.0, "completion_length": 432.85, "epoch": 0.8909090909090909, "grad_norm": 0.5940318264351889, "kl": 0.017311586905270814, "learning_rate": 2e-07, "loss": 0.029576906561851503, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.05000000149011612, "reward_std": 0.13558491468429565, "rewards/MultiModalAccuracyORM": 0.05000000149011612, "step": 2205, "train_speed(iter/s)": 0.03248 }, { "clip_ratio": 0.0, "completion_length": 505.3, "epoch": 0.8929292929292929, "grad_norm": 0.8492665899886147, "kl": 0.019003557693213224, "learning_rate": 2e-07, "loss": 0.07017003893852233, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333395421505, "reward_std": 0.3041664451360703, "rewards/MultiModalAccuracyORM": 0.1833333395421505, "step": 2210, "train_speed(iter/s)": 0.032484 }, { "clip_ratio": 0.0, "completion_length": 240.35, "epoch": 0.8949494949494949, "grad_norm": 1.0856523861671716, "kl": 0.027655008435249328, "learning_rate": 2e-07, "loss": -0.004737144708633423, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2833333365619183, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.2833333365619183, "step": 2215, "train_speed(iter/s)": 0.032493 }, { "clip_ratio": 0.0, "completion_length": 309.9, "epoch": 0.896969696969697, "grad_norm": 1.448236523400996, "kl": 0.018335943669080736, "learning_rate": 2e-07, "loss": -0.03629968166351318, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.3113932520151138, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 2220, "train_speed(iter/s)": 0.032499 }, { "clip_ratio": 0.0, "completion_length": 249.55, "epoch": 0.898989898989899, "grad_norm": 7.847983245982503, "kl": 0.049338278640061614, "learning_rate": 2e-07, "loss": -0.000819157063961029, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.19166666865348816, "reward_std": 0.3003867596387863, "rewards/MultiModalAccuracyORM": 0.19166666865348816, "step": 2225, "train_speed(iter/s)": 0.032517 }, { "clip_ratio": 0.0, "completion_length": 299.65, "epoch": 0.901010101010101, "grad_norm": 1.8170235507507255, "kl": 0.018048797827214004, "learning_rate": 2e-07, "loss": -0.07969279289245605, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1833333395421505, "reward_std": 0.3167103588581085, "rewards/MultiModalAccuracyORM": 0.1833333395421505, "step": 2230, "train_speed(iter/s)": 0.032523 }, { "clip_ratio": 0.0, "completion_length": 233.15, "epoch": 0.9030303030303031, "grad_norm": 1.609249526669931, "kl": 0.026530137099325658, "learning_rate": 2e-07, "loss": -0.008447715640068054, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.31666667461395265, "reward_std": 0.3106628268957138, "rewards/MultiModalAccuracyORM": 0.31666667461395265, "step": 2235, "train_speed(iter/s)": 0.032541 }, { "clip_ratio": 0.0, "completion_length": 367.8, "epoch": 0.9050505050505051, "grad_norm": 0.7699373347527835, "kl": 0.021872828295454384, "learning_rate": 2e-07, "loss": 0.034666317701339724, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15833334028720855, "reward_std": 0.24487241208553315, "rewards/MultiModalAccuracyORM": 0.15833334028720855, "step": 2240, "train_speed(iter/s)": 0.032555 }, { "clip_ratio": 0.0, "completion_length": 411.45, "epoch": 0.907070707070707, "grad_norm": 1.464770865938276, "kl": 0.018026039376854895, "learning_rate": 2e-07, "loss": 0.048737600445747375, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000596046447, "reward_std": 0.25866150557994844, "rewards/MultiModalAccuracyORM": 0.17500000596046447, "step": 2245, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.9090909090909091, "grad_norm": 1.7098872390259656, "learning_rate": 2e-07, "loss": 0.09721781015396118, "memory(GiB)": 67.41, "step": 2250, "train_speed(iter/s)": 0.032576 }, { "epoch": 0.9090909090909091, "eval_clip_ratio": 0.0, "eval_completion_length": 274.43000770568847, "eval_kl": 0.027916996125131845, "eval_loss": 0.03821293264627457, "eval_response_clip_ratio": 0.0, "eval_reward": 0.3183333395421505, "eval_reward_std": 0.3168588674068451, "eval_rewards/MultiModalAccuracyORM": 0.3183333395421505, "eval_runtime": 541.6774, "eval_samples_per_second": 0.092, "eval_steps_per_second": 0.009, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 295.35, "epoch": 0.9111111111111111, "grad_norm": 2.547618884699712, "kl": 0.022893223259598017, "learning_rate": 2e-07, "loss": 0.025194990634918212, "memory(GiB)": 67.41, "response_clip_ratio": 0.025, "reward": 0.4291666805744171, "reward_std": 0.3938093319535255, "rewards/MultiModalAccuracyORM": 0.4291666805744171, "step": 2255, "train_speed(iter/s)": 0.032148 }, { "clip_ratio": 0.0, "completion_length": 214.35, "epoch": 0.9131313131313131, "grad_norm": 1.5269748079222623, "kl": 0.08079174058511854, "learning_rate": 2e-07, "loss": 0.030228087306022645, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.40833334252238274, "reward_std": 0.21374863088130952, "rewards/MultiModalAccuracyORM": 0.40833334252238274, "step": 2260, "train_speed(iter/s)": 0.032172 }, { "clip_ratio": 0.0, "completion_length": 327.15, "epoch": 0.9151515151515152, "grad_norm": 1.4348651198084164, "kl": 0.022139840014278888, "learning_rate": 2e-07, "loss": -0.012193611264228821, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.32500001341104506, "reward_std": 0.4274616837501526, "rewards/MultiModalAccuracyORM": 0.32500001341104506, "step": 2265, "train_speed(iter/s)": 0.032191 }, { "clip_ratio": 0.0, "completion_length": 374.9, "epoch": 0.9171717171717172, "grad_norm": 0.05197357590854231, "kl": 0.021040867920964955, "learning_rate": 2e-07, "loss": 0.011036497354507447, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.0966599702835083, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 2270, "train_speed(iter/s)": 0.0322 }, { "clip_ratio": 0.0, "completion_length": 306.75, "epoch": 0.9191919191919192, "grad_norm": 0.4699960941829414, "kl": 0.0629787240177393, "learning_rate": 2e-07, "loss": 0.06589013934135438, "memory(GiB)": 67.41, "response_clip_ratio": 0.05, "reward": 0.30833334252238276, "reward_std": 0.2877832442522049, "rewards/MultiModalAccuracyORM": 0.30833334252238276, "step": 2275, "train_speed(iter/s)": 0.03221 }, { "clip_ratio": 0.0, "completion_length": 314.9, "epoch": 0.9212121212121213, "grad_norm": 1.3008238080628904, "kl": 0.026494850823655724, "learning_rate": 2e-07, "loss": -0.00757303386926651, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3833333425223827, "reward_std": 0.33621527552604674, "rewards/MultiModalAccuracyORM": 0.3833333425223827, "step": 2280, "train_speed(iter/s)": 0.032219 }, { "clip_ratio": 0.0, "completion_length": 257.8, "epoch": 0.9232323232323232, "grad_norm": 1.524415453755262, "kl": 0.017740064300596714, "learning_rate": 2e-07, "loss": 0.04480080604553223, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.2940850019454956, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 2285, "train_speed(iter/s)": 0.032228 }, { "clip_ratio": 0.0, "completion_length": 344.9, "epoch": 0.9252525252525252, "grad_norm": 1.1770888642120725, "kl": 0.020085165183991192, "learning_rate": 2e-07, "loss": 0.003685349225997925, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.30000001192092896, "reward_std": 0.394838485121727, "rewards/MultiModalAccuracyORM": 0.30000001192092896, "step": 2290, "train_speed(iter/s)": 0.032233 }, { "clip_ratio": 0.0, "completion_length": 282.95, "epoch": 0.9272727272727272, "grad_norm": 0.6172113323574312, "kl": 0.025718586426228283, "learning_rate": 2e-07, "loss": -0.038769152760505673, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.15000000223517418, "reward_std": 0.21378422081470488, "rewards/MultiModalAccuracyORM": 0.15000000223517418, "step": 2295, "train_speed(iter/s)": 0.032258 }, { "clip_ratio": 0.0, "completion_length": 381.15, "epoch": 0.9292929292929293, "grad_norm": 0.9460784361294594, "kl": 0.02194049907848239, "learning_rate": 2e-07, "loss": 0.003476354479789734, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.20000000596046447, "reward_std": 0.2918527454137802, "rewards/MultiModalAccuracyORM": 0.20000000596046447, "step": 2300, "train_speed(iter/s)": 0.032265 }, { "clip_ratio": 0.0, "completion_length": 288.1, "epoch": 0.9313131313131313, "grad_norm": 0.531311987853284, "kl": 0.01724575264379382, "learning_rate": 2e-07, "loss": 0.026836919784545898, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.21666666939854623, "reward_std": 0.3141853272914886, "rewards/MultiModalAccuracyORM": 0.21666666939854623, "step": 2305, "train_speed(iter/s)": 0.03226 }, { "clip_ratio": 0.0, "completion_length": 307.25, "epoch": 0.9333333333333333, "grad_norm": 0.4613731219877519, "kl": 0.019346164539456367, "learning_rate": 2e-07, "loss": -0.0063665717840194706, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334252238275, "reward_std": 0.22979399859905242, "rewards/MultiModalAccuracyORM": 0.23333334252238275, "step": 2310, "train_speed(iter/s)": 0.032264 }, { "clip_ratio": 0.0, "completion_length": 319.95, "epoch": 0.9353535353535354, "grad_norm": 0.5351247883145774, "kl": 0.036143379751592875, "learning_rate": 2e-07, "loss": 0.0059957727789878845, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.34166666939854623, "reward_std": 0.273775514960289, "rewards/MultiModalAccuracyORM": 0.34166666939854623, "step": 2315, "train_speed(iter/s)": 0.032281 }, { "clip_ratio": 0.0, "completion_length": 301.95, "epoch": 0.9373737373737374, "grad_norm": 1.006473375720635, "kl": 0.022489387728273868, "learning_rate": 2e-07, "loss": -0.02273874878883362, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.2651819050312042, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 2320, "train_speed(iter/s)": 0.032281 }, { "clip_ratio": 0.0, "completion_length": 396.75, "epoch": 0.9393939393939394, "grad_norm": 1.0081193199484457, "kl": 0.01994982697069645, "learning_rate": 2e-07, "loss": 0.06933374404907226, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.4342752188444138, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 2325, "train_speed(iter/s)": 0.032295 }, { "clip_ratio": 0.0, "completion_length": 259.55, "epoch": 0.9414141414141414, "grad_norm": 0.05459849756184899, "kl": 0.022601721994578838, "learning_rate": 2e-07, "loss": 0.018863174319267272, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2833333440124989, "reward_std": 0.2410811483860016, "rewards/MultiModalAccuracyORM": 0.2833333440124989, "step": 2330, "train_speed(iter/s)": 0.032314 }, { "clip_ratio": 0.0, "completion_length": 348.8, "epoch": 0.9434343434343434, "grad_norm": 1.710345471585661, "kl": 0.020212457934394478, "learning_rate": 2e-07, "loss": 0.10661859512329101, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.244369775056839, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 2335, "train_speed(iter/s)": 0.032312 }, { "clip_ratio": 0.0, "completion_length": 266.35, "epoch": 0.9454545454545454, "grad_norm": 1.207560090272438, "kl": 0.020538910292088985, "learning_rate": 2e-07, "loss": 0.00968976616859436, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.21149236261844634, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 2340, "train_speed(iter/s)": 0.032319 }, { "clip_ratio": 0.0, "completion_length": 232.7, "epoch": 0.9474747474747475, "grad_norm": 0.5661864643216962, "kl": 0.02179541252553463, "learning_rate": 2e-07, "loss": 0.03451942503452301, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2833333410322666, "reward_std": 0.3026406019926071, "rewards/MultiModalAccuracyORM": 0.2833333410322666, "step": 2345, "train_speed(iter/s)": 0.032337 }, { "clip_ratio": 0.0, "completion_length": 306.85, "epoch": 0.9494949494949495, "grad_norm": 0.6878755464268733, "kl": 0.02421591989696026, "learning_rate": 2e-07, "loss": 0.05174432992935181, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.279270276427269, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 2350, "train_speed(iter/s)": 0.032341 }, { "clip_ratio": 0.0, "completion_length": 352.45, "epoch": 0.9515151515151515, "grad_norm": 1.50264742942701, "kl": 0.02172108683735132, "learning_rate": 2e-07, "loss": -0.019475644826889037, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3416666753590107, "reward_std": 0.42271838188171384, "rewards/MultiModalAccuracyORM": 0.3416666753590107, "step": 2355, "train_speed(iter/s)": 0.032343 }, { "clip_ratio": 0.0, "completion_length": 261.65, "epoch": 0.9535353535353536, "grad_norm": 1.224629573838151, "kl": 0.03754720762372017, "learning_rate": 2e-07, "loss": 0.08559540510177613, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.5083333402872086, "reward_std": 0.2744703501462936, "rewards/MultiModalAccuracyORM": 0.5083333402872086, "step": 2360, "train_speed(iter/s)": 0.032358 }, { "clip_ratio": 0.0, "completion_length": 343.5, "epoch": 0.9555555555555556, "grad_norm": 1.4630602233812633, "kl": 0.021476354077458383, "learning_rate": 2e-07, "loss": -0.08055483698844909, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4083333395421505, "reward_std": 0.40558778643608095, "rewards/MultiModalAccuracyORM": 0.4083333395421505, "step": 2365, "train_speed(iter/s)": 0.03237 }, { "clip_ratio": 0.0, "completion_length": 345.1, "epoch": 0.9575757575757575, "grad_norm": 2.857198218490812, "kl": 0.021784471347928047, "learning_rate": 2e-07, "loss": -0.02553858757019043, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.29166667982935907, "reward_std": 0.35789157152175904, "rewards/MultiModalAccuracyORM": 0.29166667982935907, "step": 2370, "train_speed(iter/s)": 0.03238 }, { "clip_ratio": 0.0, "completion_length": 273.85, "epoch": 0.9595959595959596, "grad_norm": 1.6238009179768014, "kl": 0.03373121190816164, "learning_rate": 2e-07, "loss": 0.0017300590872764588, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.27500000670552255, "reward_std": 0.40231128334999083, "rewards/MultiModalAccuracyORM": 0.27500000670552255, "step": 2375, "train_speed(iter/s)": 0.032396 }, { "clip_ratio": 0.0, "completion_length": 372.95, "epoch": 0.9616161616161616, "grad_norm": 0.5732843040643582, "kl": 0.020048757642507554, "learning_rate": 2e-07, "loss": 0.03029699921607971, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.45000001341104506, "reward_std": 0.3471368789672852, "rewards/MultiModalAccuracyORM": 0.45000001341104506, "step": 2380, "train_speed(iter/s)": 0.032405 }, { "clip_ratio": 0.0, "completion_length": 289.35, "epoch": 0.9636363636363636, "grad_norm": 1.1293323918314548, "kl": 0.024607629235833883, "learning_rate": 2e-07, "loss": 0.0067908987402915955, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.25000000223517416, "reward_std": 0.3292373031377792, "rewards/MultiModalAccuracyORM": 0.25000000223517416, "step": 2385, "train_speed(iter/s)": 0.032423 }, { "clip_ratio": 0.0, "completion_length": 162.45, "epoch": 0.9656565656565657, "grad_norm": 1.039492421088282, "kl": 0.025942530203610658, "learning_rate": 2e-07, "loss": 0.059793722629547116, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2750000089406967, "reward_std": 0.3285214215517044, "rewards/MultiModalAccuracyORM": 0.2750000089406967, "step": 2390, "train_speed(iter/s)": 0.032437 }, { "clip_ratio": 0.0, "completion_length": 263.3, "epoch": 0.9676767676767677, "grad_norm": 1.2125676700182542, "kl": 0.024488268233835698, "learning_rate": 2e-07, "loss": 0.05103383660316467, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.3750000074505806, "reward_std": 0.39961439967155454, "rewards/MultiModalAccuracyORM": 0.3750000074505806, "step": 2395, "train_speed(iter/s)": 0.032441 }, { "clip_ratio": 0.0, "completion_length": 284.4, "epoch": 0.9696969696969697, "grad_norm": 0.09761626886432172, "kl": 0.02428424544632435, "learning_rate": 2e-07, "loss": 0.00439504086971283, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 2400, "train_speed(iter/s)": 0.03246 }, { "clip_ratio": 0.0, "completion_length": 199.25, "epoch": 0.9717171717171718, "grad_norm": 2.1810913319134038, "kl": 0.03959659710526466, "learning_rate": 2e-07, "loss": 0.02262794375419617, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.26666666865348815, "reward_std": 0.21753989458084105, "rewards/MultiModalAccuracyORM": 0.26666666865348815, "step": 2405, "train_speed(iter/s)": 0.032482 }, { "clip_ratio": 0.0, "completion_length": 295.15, "epoch": 0.9737373737373738, "grad_norm": 1.7428069378966462, "kl": 0.02672185152769089, "learning_rate": 2e-07, "loss": -0.00538158118724823, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.13333333879709244, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.13333333879709244, "step": 2410, "train_speed(iter/s)": 0.032487 }, { "clip_ratio": 0.0, "completion_length": 257.5, "epoch": 0.9757575757575757, "grad_norm": 0.7544250223135323, "kl": 0.027272729855030774, "learning_rate": 2e-07, "loss": 0.03573228120803833, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.40833334624767303, "reward_std": 0.27523933053016664, "rewards/MultiModalAccuracyORM": 0.40833334624767303, "step": 2415, "train_speed(iter/s)": 0.032504 }, { "clip_ratio": 0.0, "completion_length": 180.15, "epoch": 0.9777777777777777, "grad_norm": 1.866177647075409, "kl": 0.036007688101381066, "learning_rate": 2e-07, "loss": 0.003173720836639404, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.45000000596046447, "reward_std": 0.3689336538314819, "rewards/MultiModalAccuracyORM": 0.45000000596046447, "step": 2420, "train_speed(iter/s)": 0.032526 }, { "clip_ratio": 0.0, "completion_length": 206.6, "epoch": 0.9797979797979798, "grad_norm": 1.2725371241295251, "kl": 0.02483037244528532, "learning_rate": 2e-07, "loss": -0.025305929780006408, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.14166667237877845, "reward_std": 0.21374863088130952, "rewards/MultiModalAccuracyORM": 0.14166667237877845, "step": 2425, "train_speed(iter/s)": 0.032544 }, { "clip_ratio": 0.0, "completion_length": 267.65, "epoch": 0.9818181818181818, "grad_norm": 0.5577365367508015, "kl": 0.037827163096517326, "learning_rate": 2e-07, "loss": 0.09373842477798462, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.4333333417773247, "reward_std": 0.38300099074840543, "rewards/MultiModalAccuracyORM": 0.4333333417773247, "step": 2430, "train_speed(iter/s)": 0.032563 }, { "clip_ratio": 0.0, "completion_length": 241.65, "epoch": 0.9838383838383838, "grad_norm": 2.1936674243123027, "kl": 0.02673042882233858, "learning_rate": 2e-07, "loss": -0.004708817601203919, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.2712294369935989, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 2435, "train_speed(iter/s)": 0.032575 }, { "clip_ratio": 0.0, "completion_length": 242.1, "epoch": 0.9858585858585859, "grad_norm": 1.2081108167008505, "kl": 0.03199390545487404, "learning_rate": 2e-07, "loss": 0.08453056812286378, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.2323400765657425, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 2440, "train_speed(iter/s)": 0.032589 }, { "clip_ratio": 0.0, "completion_length": 212.6, "epoch": 0.9878787878787879, "grad_norm": 1.3157103877648637, "kl": 0.033728963509202, "learning_rate": 2e-07, "loss": 0.025054675340652467, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.35000001192092894, "reward_std": 0.323036128282547, "rewards/MultiModalAccuracyORM": 0.35000001192092894, "step": 2445, "train_speed(iter/s)": 0.032609 }, { "clip_ratio": 0.0, "completion_length": 280.85, "epoch": 0.98989898989899, "grad_norm": 0.07041901111619411, "kl": 0.029867130145430566, "learning_rate": 2e-07, "loss": 0.008186718821525574, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.1250000037252903, "reward_std": 0.21550226211547852, "rewards/MultiModalAccuracyORM": 0.1250000037252903, "step": 2450, "train_speed(iter/s)": 0.032618 }, { "clip_ratio": 0.0, "completion_length": 292.0, "epoch": 0.9919191919191919, "grad_norm": 1.672464403846531, "kl": 0.032038337737321856, "learning_rate": 2e-07, "loss": -0.020856915414333342, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2916666708886623, "reward_std": 0.28227151930332184, "rewards/MultiModalAccuracyORM": 0.2916666708886623, "step": 2455, "train_speed(iter/s)": 0.032636 }, { "clip_ratio": 0.0, "completion_length": 310.5, "epoch": 0.9939393939393939, "grad_norm": 1.1129720769248492, "kl": 0.02438914030790329, "learning_rate": 2e-07, "loss": 0.06876440644264221, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.450000012665987, "reward_std": 0.33704385757446287, "rewards/MultiModalAccuracyORM": 0.450000012665987, "step": 2460, "train_speed(iter/s)": 0.032645 }, { "clip_ratio": 0.0, "completion_length": 221.7, "epoch": 0.9959595959595959, "grad_norm": 1.6345111959053422, "kl": 0.03214533980935812, "learning_rate": 2e-07, "loss": 0.008031123876571655, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.5250000104308128, "reward_std": 0.3142238765954971, "rewards/MultiModalAccuracyORM": 0.5250000104308128, "step": 2465, "train_speed(iter/s)": 0.032661 }, { "clip_ratio": 0.0, "completion_length": 246.95, "epoch": 0.997979797979798, "grad_norm": 1.663569908963529, "kl": 0.0534836488775909, "learning_rate": 2e-07, "loss": 0.05174955129623413, "memory(GiB)": 67.41, "response_clip_ratio": 0.0, "reward": 0.2583333417773247, "reward_std": 0.33303395807743075, "rewards/MultiModalAccuracyORM": 0.2583333417773247, "step": 2470, "train_speed(iter/s)": 0.032676 }, { "epoch": 1.0, "grad_norm": 1.93613237990153, "learning_rate": 2e-07, "loss": -0.011667436361312867, "memory(GiB)": 67.41, "step": 2475, "train_speed(iter/s)": 0.03268 }, { "epoch": 1.0, "eval_clip_ratio": 0.0, "eval_completion_length": 257.6600067901611, "eval_kl": 0.04240348171442747, "eval_loss": -0.001080758636817336, "eval_response_clip_ratio": 0.003333333432674408, "eval_reward": 0.32666667327284815, "eval_reward_std": 0.29233356595039367, "eval_rewards/MultiModalAccuracyORM": 0.32666667327284815, "eval_runtime": 586.4987, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.009, "step": 2475 } ], "logging_steps": 5, "max_steps": 2475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }