{ "best_global_step": 2475, "best_metric": 0.3483333396911621, "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/intern3vl-8b-grpo_v2/v19-20250430-174625/checkpoint-2475", "epoch": 1.0, "eval_steps": 250, "global_step": 2475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.5, "completions/mean_length": 292.2916717529297, "completions/min_length": 175.5, "epoch": 0.00040404040404040404, "grad_norm": 2.6534149601732357, "kl": 0.00283050537109375, "learning_rate": 1.6129032258064515e-09, "loss": 0.04529620707035065, "memory(GiB)": 92.98, "reward": 0.2083333395421505, "reward_std": 0.3905205577611923, "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, "rewards/MultiModalAccuracyORM/std": 0.3905205577611923, "step": 1, "train_speed(iter/s)": 0.011973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.25, "completions/mean_length": 238.60417366027832, "completions/min_length": 109.75, "epoch": 0.00202020202020202, "grad_norm": 1.7382476360832968, "kl": 0.004979610443115234, "learning_rate": 8.064516129032257e-09, "loss": 0.005735308863222599, "memory(GiB)": 104.19, "reward": 0.18750000558793545, "reward_std": 0.1695556379854679, "rewards/MultiModalAccuracyORM/mean": 0.18750000558793545, "rewards/MultiModalAccuracyORM/std": 0.1695556379854679, "step": 5, "train_speed(iter/s)": 0.026061 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.8, "completions/mean_length": 493.87501831054686, "completions/min_length": 266.1, "epoch": 0.00404040404040404, "grad_norm": 1.6461868811442486, "kl": 0.0029445648193359374, "learning_rate": 1.6129032258064514e-08, "loss": 0.02294178307056427, "memory(GiB)": 104.37, "reward": 0.22500000819563865, "reward_std": 0.308176326751709, "rewards/MultiModalAccuracyORM/mean": 0.22500000819563865, "rewards/MultiModalAccuracyORM/std": 0.308176326751709, "step": 10, "train_speed(iter/s)": 0.027382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.8, "completions/mean_length": 231.4250061035156, "completions/min_length": 144.3, "epoch": 0.006060606060606061, "grad_norm": 3.6175414067372516, "kl": 0.0058765411376953125, "learning_rate": 2.4193548387096773e-08, "loss": -0.020487520098686218, "memory(GiB)": 107.13, "reward": 0.4250000178813934, "reward_std": 0.37195889055728915, "rewards/MultiModalAccuracyORM/mean": 0.4250000178813934, "rewards/MultiModalAccuracyORM/std": 0.37195889055728915, "step": 15, "train_speed(iter/s)": 0.031173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/mean_length": 374.85834045410155, "completions/min_length": 234.0, "epoch": 0.00808080808080808, "grad_norm": 2.0453002988188924, "kl": 0.0025386810302734375, "learning_rate": 3.225806451612903e-08, "loss": 0.018081194162368773, "memory(GiB)": 110.66, "reward": 0.2833333373069763, "reward_std": 0.2855865716934204, "rewards/MultiModalAccuracyORM/mean": 0.2833333373069763, "rewards/MultiModalAccuracyORM/std": 0.2855865716934204, "step": 20, "train_speed(iter/s)": 0.032111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.3, "completions/mean_length": 343.33334197998045, "completions/min_length": 163.6, "epoch": 0.010101010101010102, "grad_norm": 2.0297666321727066, "kl": 0.005942535400390625, "learning_rate": 4.032258064516129e-08, "loss": -0.003527432680130005, "memory(GiB)": 110.66, "reward": 0.26666667982935904, "reward_std": 0.3784792721271515, "rewards/MultiModalAccuracyORM/mean": 0.26666667982935904, "rewards/MultiModalAccuracyORM/std": 0.3784792721271515, "step": 25, "train_speed(iter/s)": 0.03346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.7, "completions/mean_length": 279.9750091552734, "completions/min_length": 170.9, "epoch": 0.012121212121212121, "grad_norm": 1.580858331896628, "kl": 0.0038494110107421876, "learning_rate": 4.8387096774193546e-08, "loss": -0.00242428183555603, "memory(GiB)": 110.68, "reward": 0.10000000298023223, "reward_std": 0.2711698323488235, "rewards/MultiModalAccuracyORM/mean": 0.10000000298023223, "rewards/MultiModalAccuracyORM/std": 0.2711698323488235, "step": 30, "train_speed(iter/s)": 0.034153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.8, "completions/mean_length": 286.36667404174807, "completions/min_length": 165.1, "epoch": 0.014141414141414142, "grad_norm": 1.8379975346697042, "kl": 0.02647857666015625, "learning_rate": 5.645161290322581e-08, "loss": 0.00997340977191925, "memory(GiB)": 110.68, "reward": 0.25000000521540644, "reward_std": 0.2200503796339035, "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, "rewards/MultiModalAccuracyORM/std": 0.2200503796339035, "step": 35, "train_speed(iter/s)": 0.034524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 792.7, "completions/mean_length": 407.9500198364258, "completions/min_length": 231.7, "epoch": 0.01616161616161616, "grad_norm": 1.879368475551475, "kl": 0.00126495361328125, "learning_rate": 6.451612903225806e-08, "loss": 0.005544811487197876, "memory(GiB)": 111.72, "reward": 0.16666667014360428, "reward_std": 0.32451151907444, "rewards/MultiModalAccuracyORM/mean": 0.16666667014360428, "rewards/MultiModalAccuracyORM/std": 0.32451151907444, "step": 40, "train_speed(iter/s)": 0.034576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.7, "completions/mean_length": 326.12501068115233, "completions/min_length": 189.6, "epoch": 0.01818181818181818, "grad_norm": 0.7460899635365059, "kl": 0.0039581298828125, "learning_rate": 7.258064516129032e-08, "loss": 0.006708705425262451, "memory(GiB)": 111.74, "reward": 0.2083333395421505, "reward_std": 0.22406027615070342, "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, "rewards/MultiModalAccuracyORM/std": 0.22406027615070342, "step": 45, "train_speed(iter/s)": 0.034933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.3, "completions/mean_length": 274.28333892822263, "completions/min_length": 131.9, "epoch": 0.020202020202020204, "grad_norm": 2.4079312295812714, "kl": 0.00251922607421875, "learning_rate": 8.064516129032257e-08, "loss": 0.015183356404304505, "memory(GiB)": 111.74, "reward": 0.21666667386889457, "reward_std": 0.25738072395324707, "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, "rewards/MultiModalAccuracyORM/std": 0.25738072395324707, "step": 50, "train_speed(iter/s)": 0.035232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.5, "completions/mean_length": 365.0666778564453, "completions/min_length": 191.6, "epoch": 0.022222222222222223, "grad_norm": 0.014705836185752576, "kl": 0.004721450805664063, "learning_rate": 8.870967741935484e-08, "loss": 0.01203818917274475, "memory(GiB)": 111.74, "reward": 0.32500001043081284, "reward_std": 0.3044206529855728, "rewards/MultiModalAccuracyORM/mean": 0.32500001043081284, "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, "step": 55, "train_speed(iter/s)": 0.035135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.3, "completions/mean_length": 338.5833435058594, "completions/min_length": 199.7, "epoch": 0.024242424242424242, "grad_norm": 2.6954085340696765, "kl": 0.0020017623901367188, "learning_rate": 9.677419354838709e-08, "loss": -0.005992072820663452, "memory(GiB)": 111.74, "reward": 0.18333333507180213, "reward_std": 0.33354574739933013, "rewards/MultiModalAccuracyORM/mean": 0.18333333507180213, "rewards/MultiModalAccuracyORM/std": 0.33354574739933013, "step": 60, "train_speed(iter/s)": 0.035177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.8, "completions/mean_length": 363.6166793823242, "completions/min_length": 208.5, "epoch": 0.026262626262626262, "grad_norm": 3.0115754925592952, "kl": 0.0037433624267578123, "learning_rate": 1.0483870967741934e-07, "loss": -0.03836339712142944, "memory(GiB)": 111.74, "reward": 0.2666666738688946, "reward_std": 0.4085534304380417, "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, "rewards/MultiModalAccuracyORM/std": 0.4085534304380417, "step": 65, "train_speed(iter/s)": 0.035437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.2, "completions/mean_length": 377.9750099182129, "completions/min_length": 204.0, "epoch": 0.028282828282828285, "grad_norm": 1.7279437509176054, "kl": 0.001779937744140625, "learning_rate": 1.1290322580645162e-07, "loss": -0.05415753722190857, "memory(GiB)": 111.74, "reward": 0.3000000074505806, "reward_std": 0.30035116374492643, "rewards/MultiModalAccuracyORM/mean": 0.3000000074505806, "rewards/MultiModalAccuracyORM/std": 0.30035116374492643, "step": 70, "train_speed(iter/s)": 0.035665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.6, "completions/mean_length": 242.45834197998047, "completions/min_length": 116.8, "epoch": 0.030303030303030304, "grad_norm": 3.0031072335906597, "kl": 0.002858734130859375, "learning_rate": 1.2096774193548387e-07, "loss": 0.03029954433441162, "memory(GiB)": 111.74, "reward": 0.26666667237877845, "reward_std": 0.36043521761894226, "rewards/MultiModalAccuracyORM/mean": 0.26666667237877845, "rewards/MultiModalAccuracyORM/std": 0.36043521761894226, "step": 75, "train_speed(iter/s)": 0.036005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 776.6, "completions/mean_length": 435.741682434082, "completions/min_length": 231.3, "epoch": 0.03232323232323232, "grad_norm": 0.42303978897841893, "kl": 0.0016681671142578125, "learning_rate": 1.2903225806451611e-07, "loss": 0.049380439519882205, "memory(GiB)": 111.74, "reward": 0.325000012665987, "reward_std": 0.3008513689041138, "rewards/MultiModalAccuracyORM/mean": 0.325000012665987, "rewards/MultiModalAccuracyORM/std": 0.3008513689041138, "step": 80, "train_speed(iter/s)": 0.035635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.1, "completions/mean_length": 302.8333435058594, "completions/min_length": 166.0, "epoch": 0.03434343434343434, "grad_norm": 2.6438328703498097, "kl": 0.00451507568359375, "learning_rate": 1.3709677419354838e-07, "loss": -0.0442815363407135, "memory(GiB)": 111.74, "reward": 0.2833333402872086, "reward_std": 0.3933126300573349, "rewards/MultiModalAccuracyORM/mean": 0.2833333402872086, "rewards/MultiModalAccuracyORM/std": 0.3933126300573349, "step": 85, "train_speed(iter/s)": 0.035979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/mean_length": 381.02501525878904, "completions/min_length": 183.3, "epoch": 0.03636363636363636, "grad_norm": 1.74840980915549, "kl": 0.0013660430908203126, "learning_rate": 1.4516129032258064e-07, "loss": 0.07182409167289734, "memory(GiB)": 111.74, "reward": 0.30000000521540643, "reward_std": 0.35937642157077787, "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, "rewards/MultiModalAccuracyORM/std": 0.35937642157077787, "step": 90, "train_speed(iter/s)": 0.035659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.2, "completions/mean_length": 325.55834197998047, "completions/min_length": 170.8, "epoch": 0.03838383838383838, "grad_norm": 0.04177816415582162, "kl": 0.014581298828125, "learning_rate": 1.5322580645161288e-07, "loss": 0.029976147413253783, "memory(GiB)": 111.74, "reward": 0.18333333879709243, "reward_std": 0.2358713388442993, "rewards/MultiModalAccuracyORM/mean": 0.18333333879709243, "rewards/MultiModalAccuracyORM/std": 0.2358713388442993, "step": 95, "train_speed(iter/s)": 0.035533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.2, "completions/mean_length": 339.98334045410155, "completions/min_length": 187.8, "epoch": 0.04040404040404041, "grad_norm": 3.190540630566101, "kl": 0.004257583618164062, "learning_rate": 1.6129032258064515e-07, "loss": 0.0416176974773407, "memory(GiB)": 111.74, "reward": 0.28333334252238274, "reward_std": 0.3247897386550903, "rewards/MultiModalAccuracyORM/mean": 0.28333334252238274, "rewards/MultiModalAccuracyORM/std": 0.3247897386550903, "step": 100, "train_speed(iter/s)": 0.035677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 589.7, "completions/mean_length": 345.52500610351564, "completions/min_length": 173.9, "epoch": 0.04242424242424243, "grad_norm": 3.073635935584006, "kl": 0.00194549560546875, "learning_rate": 1.6935483870967741e-07, "loss": 0.042548298835754395, "memory(GiB)": 111.74, "reward": 0.2000000111758709, "reward_std": 0.2611959934234619, "rewards/MultiModalAccuracyORM/mean": 0.2000000111758709, "rewards/MultiModalAccuracyORM/std": 0.2611959934234619, "step": 105, "train_speed(iter/s)": 0.035468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 745.5, "completions/mean_length": 380.5166748046875, "completions/min_length": 225.9, "epoch": 0.044444444444444446, "grad_norm": 0.9626100429708261, "kl": 0.0016246795654296874, "learning_rate": 1.7741935483870968e-07, "loss": -0.02766646146774292, "memory(GiB)": 111.74, "reward": 0.1916666731238365, "reward_std": 0.3073477536439896, "rewards/MultiModalAccuracyORM/mean": 0.1916666731238365, "rewards/MultiModalAccuracyORM/std": 0.3073477536439896, "step": 110, "train_speed(iter/s)": 0.035455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/mean_length": 311.90000915527344, "completions/min_length": 154.2, "epoch": 0.046464646464646465, "grad_norm": 1.342836390340581, "kl": 0.008540725708007813, "learning_rate": 1.8548387096774192e-07, "loss": -0.010879068076610566, "memory(GiB)": 111.74, "reward": 0.10000000074505806, "reward_std": 0.22228264510631562, "rewards/MultiModalAccuracyORM/mean": 0.10000000074505806, "rewards/MultiModalAccuracyORM/std": 0.22228264510631562, "step": 115, "train_speed(iter/s)": 0.035535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/mean_length": 288.9583435058594, "completions/min_length": 165.6, "epoch": 0.048484848484848485, "grad_norm": 2.6619939115206135, "kl": 0.00256195068359375, "learning_rate": 1.9354838709677418e-07, "loss": 0.033258992433547976, "memory(GiB)": 111.74, "reward": 0.4083333469927311, "reward_std": 0.40963622033596037, "rewards/MultiModalAccuracyORM/mean": 0.4083333469927311, "rewards/MultiModalAccuracyORM/std": 0.40963622033596037, "step": 120, "train_speed(iter/s)": 0.035724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/mean_length": 379.45001220703125, "completions/min_length": 187.1, "epoch": 0.050505050505050504, "grad_norm": 1.321049130692736, "kl": 0.0020069122314453126, "learning_rate": 2e-07, "loss": -0.019822967052459717, "memory(GiB)": 111.74, "reward": 0.2916666708886623, "reward_std": 0.32370694279670714, "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, "rewards/MultiModalAccuracyORM/std": 0.32370694279670714, "step": 125, "train_speed(iter/s)": 0.035602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.7, "completions/mean_length": 316.5916748046875, "completions/min_length": 171.4, "epoch": 0.052525252525252523, "grad_norm": 2.460967418512405, "kl": 0.0105987548828125, "learning_rate": 2e-07, "loss": 0.0003096837550401688, "memory(GiB)": 111.74, "reward": 0.20833333656191827, "reward_std": 0.29007510244846346, "rewards/MultiModalAccuracyORM/mean": 0.20833333656191827, "rewards/MultiModalAccuracyORM/std": 0.29007510244846346, "step": 130, "train_speed(iter/s)": 0.035448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.4, "completions/mean_length": 387.5166763305664, "completions/min_length": 184.3, "epoch": 0.05454545454545454, "grad_norm": 0.059862028341158974, "kl": 0.011987686157226562, "learning_rate": 2e-07, "loss": -0.011434757709503173, "memory(GiB)": 111.74, "reward": 0.1083333358168602, "reward_std": 0.25866150557994844, "rewards/MultiModalAccuracyORM/mean": 0.1083333358168602, "rewards/MultiModalAccuracyORM/std": 0.25866150557994844, "step": 135, "train_speed(iter/s)": 0.035278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.3, "completions/mean_length": 382.4166778564453, "completions/min_length": 206.3, "epoch": 0.05656565656565657, "grad_norm": 0.8204164270444702, "kl": 0.002767181396484375, "learning_rate": 2e-07, "loss": 0.004211039841175079, "memory(GiB)": 111.74, "reward": 0.27500001192092893, "reward_std": 0.2777498096227646, "rewards/MultiModalAccuracyORM/mean": 0.27500001192092893, "rewards/MultiModalAccuracyORM/std": 0.2777498096227646, "step": 140, "train_speed(iter/s)": 0.035472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.4, "completions/mean_length": 358.13333892822266, "completions/min_length": 230.0, "epoch": 0.05858585858585859, "grad_norm": 2.288187560312466, "kl": 0.006110763549804688, "learning_rate": 2e-07, "loss": -6.483197212219239e-05, "memory(GiB)": 111.74, "reward": 0.13333334028720856, "reward_std": 0.19964569807052612, "rewards/MultiModalAccuracyORM/mean": 0.13333334028720856, "rewards/MultiModalAccuracyORM/std": 0.19964569807052612, "step": 145, "train_speed(iter/s)": 0.035406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.8, "completions/mean_length": 361.4166763305664, "completions/min_length": 210.5, "epoch": 0.06060606060606061, "grad_norm": 0.015594201645230225, "kl": 0.015087890625, "learning_rate": 2e-07, "loss": 0.015390211343765258, "memory(GiB)": 111.74, "reward": 0.14166667237877845, "reward_std": 0.21374862194061278, "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, "rewards/MultiModalAccuracyORM/std": 0.21374862194061278, "step": 150, "train_speed(iter/s)": 0.035348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.4, "completions/mean_length": 268.90834045410156, "completions/min_length": 145.7, "epoch": 0.06262626262626263, "grad_norm": 1.9984607447420715, "kl": 0.009865570068359374, "learning_rate": 2e-07, "loss": 0.041778740286827085, "memory(GiB)": 111.74, "reward": 0.15000000596046448, "reward_std": 0.2238060563802719, "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, "rewards/MultiModalAccuracyORM/std": 0.2238060563802719, "step": 155, "train_speed(iter/s)": 0.035429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.6, "completions/mean_length": 307.5416748046875, "completions/min_length": 163.1, "epoch": 0.06464646464646465, "grad_norm": 1.9710039404778148, "kl": 0.0016231536865234375, "learning_rate": 2e-07, "loss": 0.06229003667831421, "memory(GiB)": 111.74, "reward": 0.2583333395421505, "reward_std": 0.35413345992565154, "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, "rewards/MultiModalAccuracyORM/std": 0.35413345992565154, "step": 160, "train_speed(iter/s)": 0.035424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.8, "completions/mean_length": 392.75000915527346, "completions/min_length": 207.7, "epoch": 0.06666666666666667, "grad_norm": 1.4786377917798241, "kl": 0.009944915771484375, "learning_rate": 2e-07, "loss": 0.01215519905090332, "memory(GiB)": 111.74, "reward": 0.24166667237877845, "reward_std": 0.28784283697605134, "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, "step": 165, "train_speed(iter/s)": 0.035279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.7, "completions/mean_length": 280.4750061035156, "completions/min_length": 144.3, "epoch": 0.06868686868686869, "grad_norm": 3.7940420455147077, "kl": 0.019321441650390625, "learning_rate": 2e-07, "loss": -0.022571200132369997, "memory(GiB)": 111.74, "reward": 0.30833334028720855, "reward_std": 0.365692725777626, "rewards/MultiModalAccuracyORM/mean": 0.30833334028720855, "rewards/MultiModalAccuracyORM/std": 0.365692725777626, "step": 170, "train_speed(iter/s)": 0.035381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.2, "completions/mean_length": 346.808341217041, "completions/min_length": 159.3, "epoch": 0.0707070707070707, "grad_norm": 1.6037297839480729, "kl": 0.0017574310302734375, "learning_rate": 2e-07, "loss": 0.05014150142669678, "memory(GiB)": 111.74, "reward": 0.35000001415610316, "reward_std": 0.3534030318260193, "rewards/MultiModalAccuracyORM/mean": 0.35000001415610316, "rewards/MultiModalAccuracyORM/std": 0.3534030318260193, "step": 175, "train_speed(iter/s)": 0.035382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/mean_length": 324.31666870117186, "completions/min_length": 202.0, "epoch": 0.07272727272727272, "grad_norm": 2.7315358529507865, "kl": 0.0067108154296875, "learning_rate": 2e-07, "loss": 0.017354550957679748, "memory(GiB)": 111.74, "reward": 0.10833333730697632, "reward_std": 0.2448128044605255, "rewards/MultiModalAccuracyORM/mean": 0.10833333730697632, "rewards/MultiModalAccuracyORM/std": 0.2448128044605255, "step": 180, "train_speed(iter/s)": 0.035416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.3, "completions/mean_length": 270.41667861938475, "completions/min_length": 138.9, "epoch": 0.07474747474747474, "grad_norm": 2.314028672730481, "kl": 0.002983856201171875, "learning_rate": 2e-07, "loss": 0.033014419674873355, "memory(GiB)": 111.74, "reward": 0.3333333425223827, "reward_std": 0.2566834628582001, "rewards/MultiModalAccuracyORM/mean": 0.3333333425223827, "rewards/MultiModalAccuracyORM/std": 0.2566834628582001, "step": 185, "train_speed(iter/s)": 0.035387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.6, "completions/mean_length": 341.2416763305664, "completions/min_length": 181.1, "epoch": 0.07676767676767676, "grad_norm": 2.3931438253006387, "kl": 0.00200347900390625, "learning_rate": 2e-07, "loss": 0.038839906454086304, "memory(GiB)": 111.74, "reward": 0.17500000596046447, "reward_std": 0.2684228092432022, "rewards/MultiModalAccuracyORM/mean": 0.17500000596046447, "rewards/MultiModalAccuracyORM/std": 0.2684228092432022, "step": 190, "train_speed(iter/s)": 0.03545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.3, "completions/mean_length": 375.1000045776367, "completions/min_length": 215.9, "epoch": 0.07878787878787878, "grad_norm": 1.8630040945251685, "kl": 0.002384376525878906, "learning_rate": 2e-07, "loss": -0.015469104051589966, "memory(GiB)": 111.74, "reward": 0.1583333395421505, "reward_std": 0.27148365080356596, "rewards/MultiModalAccuracyORM/mean": 0.1583333395421505, "rewards/MultiModalAccuracyORM/std": 0.27148365080356596, "step": 195, "train_speed(iter/s)": 0.035415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.2, "completions/mean_length": 379.8833435058594, "completions/min_length": 200.8, "epoch": 0.08080808080808081, "grad_norm": 2.200570213421646, "kl": 0.0036174774169921873, "learning_rate": 2e-07, "loss": 0.006271684169769287, "memory(GiB)": 111.74, "reward": 0.25000000447034837, "reward_std": 0.42421777844429015, "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, "rewards/MultiModalAccuracyORM/std": 0.42421777844429015, "step": 200, "train_speed(iter/s)": 0.035369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.3, "completions/mean_length": 345.00001220703126, "completions/min_length": 174.6, "epoch": 0.08282828282828283, "grad_norm": 1.1008615802288388, "kl": 0.0024932861328125, "learning_rate": 2e-07, "loss": 0.006234277784824371, "memory(GiB)": 111.74, "reward": 0.16666667237877847, "reward_std": 0.2938547760248184, "rewards/MultiModalAccuracyORM/mean": 0.16666667237877847, "rewards/MultiModalAccuracyORM/std": 0.2938547760248184, "step": 205, "train_speed(iter/s)": 0.035338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.3, "completions/mean_length": 269.37500762939453, "completions/min_length": 147.6, "epoch": 0.08484848484848485, "grad_norm": 3.476093319706285, "kl": 0.0026340484619140625, "learning_rate": 2e-07, "loss": -0.0015334427356719972, "memory(GiB)": 111.74, "reward": 0.25000000447034837, "reward_std": 0.300192129611969, "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, "rewards/MultiModalAccuracyORM/std": 0.300192129611969, "step": 210, "train_speed(iter/s)": 0.035464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.8, "completions/mean_length": 285.75000762939453, "completions/min_length": 148.7, "epoch": 0.08686868686868687, "grad_norm": 2.1593026278667984, "kl": 0.006510162353515625, "learning_rate": 2e-07, "loss": -0.015721744298934935, "memory(GiB)": 111.74, "reward": 0.21666667088866234, "reward_std": 0.3470772713422775, "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, "rewards/MultiModalAccuracyORM/std": 0.3470772713422775, "step": 215, "train_speed(iter/s)": 0.035439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.7, "completions/mean_length": 354.20001220703125, "completions/min_length": 199.6, "epoch": 0.08888888888888889, "grad_norm": 3.7456181210533077, "kl": 0.004998016357421875, "learning_rate": 2e-07, "loss": -0.02768584489822388, "memory(GiB)": 111.74, "reward": 0.28333333879709244, "reward_std": 0.28452777564525605, "rewards/MultiModalAccuracyORM/mean": 0.28333333879709244, "rewards/MultiModalAccuracyORM/std": 0.28452777564525605, "step": 220, "train_speed(iter/s)": 0.035428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.5, "completions/mean_length": 311.5416778564453, "completions/min_length": 177.8, "epoch": 0.09090909090909091, "grad_norm": 2.0378307788473684, "kl": 0.002862548828125, "learning_rate": 2e-07, "loss": 0.003831219673156738, "memory(GiB)": 111.74, "reward": 0.4000000111758709, "reward_std": 0.3752594023942947, "rewards/MultiModalAccuracyORM/mean": 0.4000000111758709, "rewards/MultiModalAccuracyORM/std": 0.3752594023942947, "step": 225, "train_speed(iter/s)": 0.035407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.5, "completions/mean_length": 371.4583435058594, "completions/min_length": 190.4, "epoch": 0.09292929292929293, "grad_norm": 2.1323681326918855, "kl": 0.0035661697387695313, "learning_rate": 2e-07, "loss": 0.0016314834356307983, "memory(GiB)": 111.74, "reward": 0.2083333432674408, "reward_std": 0.3477985322475433, "rewards/MultiModalAccuracyORM/mean": 0.2083333432674408, "rewards/MultiModalAccuracyORM/std": 0.3477985322475433, "step": 230, "train_speed(iter/s)": 0.035371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/mean_length": 287.6916778564453, "completions/min_length": 168.0, "epoch": 0.09494949494949495, "grad_norm": 3.249083513364966, "kl": 0.00834503173828125, "learning_rate": 2e-07, "loss": -0.004596877098083496, "memory(GiB)": 111.74, "reward": 0.13333333730697633, "reward_std": 0.19513316750526427, "rewards/MultiModalAccuracyORM/mean": 0.13333333730697633, "rewards/MultiModalAccuracyORM/std": 0.19513316750526427, "step": 235, "train_speed(iter/s)": 0.03535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.4, "completions/mean_length": 316.3583450317383, "completions/min_length": 173.8, "epoch": 0.09696969696969697, "grad_norm": 2.412571205764537, "kl": 0.005106735229492188, "learning_rate": 2e-07, "loss": 0.004295679926872254, "memory(GiB)": 111.74, "reward": 0.23333333879709245, "reward_std": 0.3171865612268448, "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, "rewards/MultiModalAccuracyORM/std": 0.3171865612268448, "step": 240, "train_speed(iter/s)": 0.035314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.3, "completions/mean_length": 298.17500457763674, "completions/min_length": 166.5, "epoch": 0.09898989898989899, "grad_norm": 1.9493555044308044, "kl": 0.003982925415039062, "learning_rate": 2e-07, "loss": -0.04734513759613037, "memory(GiB)": 111.74, "reward": 0.2333333395421505, "reward_std": 0.3471368670463562, "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, "rewards/MultiModalAccuracyORM/std": 0.3471368670463562, "step": 245, "train_speed(iter/s)": 0.035338 }, { "epoch": 0.10101010101010101, "grad_norm": 1.3381064401700158, "learning_rate": 2e-07, "loss": -0.013491255044937134, "memory(GiB)": 111.78, "step": 250, "train_speed(iter/s)": 0.035321 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0016666666666666666, "eval_completions/max_length": 567.88, "eval_completions/mean_length": 340.8433419799805, "eval_completions/min_length": 176.68, "eval_kl": 0.0008290672302246094, "eval_loss": 0.011471391655504704, "eval_reward": 0.25833333894610405, "eval_reward_std": 0.3269642275571823, "eval_rewards/MultiModalAccuracyORM/mean": 0.25833333894610405, "eval_rewards/MultiModalAccuracyORM/std": 0.3269642275571823, "eval_runtime": 589.5277, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.008, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.2, "completions/mean_length": 405.27917556762696, "completions/min_length": 229.2, "epoch": 0.10303030303030303, "grad_norm": 1.3096626974864818, "kl": 0.002015495300292969, "learning_rate": 2e-07, "loss": 0.022876815497875215, "memory(GiB)": 113.5, "reward": 0.21250000447034836, "reward_std": 0.2526913657784462, "rewards/MultiModalAccuracyORM/mean": 0.21250000447034836, "rewards/MultiModalAccuracyORM/std": 0.2526913657784462, "step": 255, "train_speed(iter/s)": 0.031791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.6, "completions/mean_length": 291.32500915527345, "completions/min_length": 161.1, "epoch": 0.10505050505050505, "grad_norm": 2.7968135195637585, "kl": 0.0034709930419921874, "learning_rate": 2e-07, "loss": 0.02938370406627655, "memory(GiB)": 113.5, "reward": 0.2333333410322666, "reward_std": 0.30821192264556885, "rewards/MultiModalAccuracyORM/mean": 0.2333333410322666, "rewards/MultiModalAccuracyORM/std": 0.30821192264556885, "step": 260, "train_speed(iter/s)": 0.031882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.9, "completions/mean_length": 381.02501220703124, "completions/min_length": 193.9, "epoch": 0.10707070707070707, "grad_norm": 2.2674884321553908, "kl": 0.0033966064453125, "learning_rate": 2e-07, "loss": 0.03137490749359131, "memory(GiB)": 113.5, "reward": 0.20000000149011612, "reward_std": 0.3492949903011322, "rewards/MultiModalAccuracyORM/mean": 0.20000000149011612, "rewards/MultiModalAccuracyORM/std": 0.3492949903011322, "step": 265, "train_speed(iter/s)": 0.031856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.5, "completions/mean_length": 384.0666763305664, "completions/min_length": 238.4, "epoch": 0.10909090909090909, "grad_norm": 1.4757764767450905, "kl": 0.006084823608398437, "learning_rate": 2e-07, "loss": 0.012543919682502746, "memory(GiB)": 113.5, "reward": 0.3000000141561031, "reward_std": 0.42771587073802947, "rewards/MultiModalAccuracyORM/mean": 0.3000000141561031, "rewards/MultiModalAccuracyORM/std": 0.42771587073802947, "step": 270, "train_speed(iter/s)": 0.031865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.9, "completions/mean_length": 362.15000610351564, "completions/min_length": 202.6, "epoch": 0.1111111111111111, "grad_norm": 2.133208686622741, "kl": 0.004328155517578125, "learning_rate": 2e-07, "loss": 0.014178204536437988, "memory(GiB)": 113.5, "reward": 0.3083333447575569, "reward_std": 0.35184402465820314, "rewards/MultiModalAccuracyORM/mean": 0.3083333447575569, "rewards/MultiModalAccuracyORM/std": 0.35184402465820314, "step": 275, "train_speed(iter/s)": 0.031998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.6, "completions/mean_length": 274.9250061035156, "completions/min_length": 153.3, "epoch": 0.11313131313131314, "grad_norm": 2.320837755784546, "kl": 0.002793121337890625, "learning_rate": 2e-07, "loss": -0.002980351448059082, "memory(GiB)": 113.5, "reward": 0.2666666738688946, "reward_std": 0.30639869570732114, "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, "rewards/MultiModalAccuracyORM/std": 0.30639869570732114, "step": 280, "train_speed(iter/s)": 0.032128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03333333333333333, "completions/max_length": 807.9, "completions/mean_length": 470.2083465576172, "completions/min_length": 219.6, "epoch": 0.11515151515151516, "grad_norm": 1.5979399011587243, "kl": 0.006278228759765625, "learning_rate": 2e-07, "loss": 0.01850479543209076, "memory(GiB)": 113.5, "reward": 0.39166667088866236, "reward_std": 0.4097074121236801, "rewards/MultiModalAccuracyORM/mean": 0.39166667088866236, "rewards/MultiModalAccuracyORM/std": 0.4097074121236801, "step": 285, "train_speed(iter/s)": 0.032047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.6, "completions/mean_length": 375.62501373291013, "completions/min_length": 199.8, "epoch": 0.11717171717171718, "grad_norm": 1.6711790369238562, "kl": 0.002816009521484375, "learning_rate": 2e-07, "loss": 0.05777819156646728, "memory(GiB)": 113.5, "reward": 0.34166667237877846, "reward_std": 0.34181976318359375, "rewards/MultiModalAccuracyORM/mean": 0.34166667237877846, "rewards/MultiModalAccuracyORM/std": 0.34181976318359375, "step": 290, "train_speed(iter/s)": 0.032072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.7, "completions/mean_length": 373.40001068115237, "completions/min_length": 222.1, "epoch": 0.1191919191919192, "grad_norm": 1.2952752164962844, "kl": 0.006529617309570313, "learning_rate": 2e-07, "loss": 0.02864307165145874, "memory(GiB)": 113.5, "reward": 0.21666667386889457, "reward_std": 0.22631654143333435, "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, "rewards/MultiModalAccuracyORM/std": 0.22631654143333435, "step": 295, "train_speed(iter/s)": 0.032146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.7, "completions/mean_length": 389.9750091552734, "completions/min_length": 260.7, "epoch": 0.12121212121212122, "grad_norm": 2.5199865002602895, "kl": 0.00448150634765625, "learning_rate": 2e-07, "loss": 0.0044337153434753414, "memory(GiB)": 113.5, "reward": 0.3583333417773247, "reward_std": 0.3886078953742981, "rewards/MultiModalAccuracyORM/mean": 0.3583333417773247, "rewards/MultiModalAccuracyORM/std": 0.3886078953742981, "step": 300, "train_speed(iter/s)": 0.03218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.3, "completions/mean_length": 298.0666748046875, "completions/min_length": 161.4, "epoch": 0.12323232323232323, "grad_norm": 0.04178305906141455, "kl": 0.00428619384765625, "learning_rate": 2e-07, "loss": -0.04246575832366943, "memory(GiB)": 113.5, "reward": 0.10000000223517418, "reward_std": 0.20118070244789124, "rewards/MultiModalAccuracyORM/mean": 0.10000000223517418, "rewards/MultiModalAccuracyORM/std": 0.20118070244789124, "step": 305, "train_speed(iter/s)": 0.032256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.8, "completions/mean_length": 311.39167404174805, "completions/min_length": 131.0, "epoch": 0.12525252525252525, "grad_norm": 0.041069103688074135, "kl": 0.004656982421875, "learning_rate": 2e-07, "loss": 0.024589771032333375, "memory(GiB)": 113.5, "reward": 0.23333334401249886, "reward_std": 0.274494343996048, "rewards/MultiModalAccuracyORM/mean": 0.23333334401249886, "rewards/MultiModalAccuracyORM/std": 0.274494343996048, "step": 310, "train_speed(iter/s)": 0.032348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.4, "completions/mean_length": 349.1750030517578, "completions/min_length": 191.6, "epoch": 0.12727272727272726, "grad_norm": 1.4578057904181938, "kl": 0.008466339111328125, "learning_rate": 2e-07, "loss": 0.019071149826049804, "memory(GiB)": 113.5, "reward": 0.18333334103226662, "reward_std": 0.24637180864810942, "rewards/MultiModalAccuracyORM/mean": 0.18333334103226662, "rewards/MultiModalAccuracyORM/std": 0.24637180864810942, "step": 315, "train_speed(iter/s)": 0.032385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.1, "completions/mean_length": 305.83334426879884, "completions/min_length": 177.6, "epoch": 0.1292929292929293, "grad_norm": 2.0332697577512895, "kl": 0.003513336181640625, "learning_rate": 2e-07, "loss": 0.012425613403320313, "memory(GiB)": 113.5, "reward": 0.2583333395421505, "reward_std": 0.3207202464342117, "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, "step": 320, "train_speed(iter/s)": 0.032468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.6, "completions/mean_length": 350.608341217041, "completions/min_length": 207.5, "epoch": 0.13131313131313133, "grad_norm": 2.9017059326660206, "kl": 0.008218002319335938, "learning_rate": 2e-07, "loss": -0.007495748996734619, "memory(GiB)": 113.5, "reward": 0.24166667237877845, "reward_std": 0.2847819983959198, "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, "rewards/MultiModalAccuracyORM/std": 0.2847819983959198, "step": 325, "train_speed(iter/s)": 0.032489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.7, "completions/mean_length": 348.37501220703126, "completions/min_length": 230.4, "epoch": 0.13333333333333333, "grad_norm": 2.0452895180997612, "kl": 0.00405426025390625, "learning_rate": 2e-07, "loss": 0.012925130128860474, "memory(GiB)": 113.5, "reward": 0.2250000059604645, "reward_std": 0.34633229672908783, "rewards/MultiModalAccuracyORM/mean": 0.2250000059604645, "rewards/MultiModalAccuracyORM/std": 0.34633229672908783, "step": 330, "train_speed(iter/s)": 0.032601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.4, "completions/mean_length": 391.80001068115234, "completions/min_length": 205.6, "epoch": 0.13535353535353536, "grad_norm": 2.3689531245965014, "kl": 0.0037220001220703127, "learning_rate": 2e-07, "loss": -0.02884441614151001, "memory(GiB)": 113.5, "reward": 0.34166667610406876, "reward_std": 0.3244759202003479, "rewards/MultiModalAccuracyORM/mean": 0.34166667610406876, "rewards/MultiModalAccuracyORM/std": 0.3244759202003479, "step": 335, "train_speed(iter/s)": 0.032628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.7, "completions/mean_length": 393.9500137329102, "completions/min_length": 210.7, "epoch": 0.13737373737373737, "grad_norm": 3.1268062962961447, "kl": 0.00513458251953125, "learning_rate": 2e-07, "loss": -0.007295359671115875, "memory(GiB)": 113.5, "reward": 0.12500000447034837, "reward_std": 0.2837377518415451, "rewards/MultiModalAccuracyORM/mean": 0.12500000447034837, "rewards/MultiModalAccuracyORM/std": 0.2837377518415451, "step": 340, "train_speed(iter/s)": 0.032638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.2, "completions/mean_length": 325.42500457763674, "completions/min_length": 202.4, "epoch": 0.1393939393939394, "grad_norm": 2.570539853128275, "kl": 0.010897064208984375, "learning_rate": 2e-07, "loss": -0.03583614826202393, "memory(GiB)": 113.5, "reward": 0.23333333879709245, "reward_std": 0.28154108226299285, "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, "rewards/MultiModalAccuracyORM/std": 0.28154108226299285, "step": 345, "train_speed(iter/s)": 0.032636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.3, "completions/mean_length": 348.71667327880857, "completions/min_length": 202.0, "epoch": 0.1414141414141414, "grad_norm": 1.4744760782673672, "kl": 0.005255126953125, "learning_rate": 2e-07, "loss": 0.06839704513549805, "memory(GiB)": 113.5, "reward": 0.3416666738688946, "reward_std": 0.3267677813768387, "rewards/MultiModalAccuracyORM/mean": 0.3416666738688946, "rewards/MultiModalAccuracyORM/std": 0.3267677813768387, "step": 350, "train_speed(iter/s)": 0.032723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 685.7, "completions/mean_length": 395.8666763305664, "completions/min_length": 217.6, "epoch": 0.14343434343434344, "grad_norm": 0.032365545804024926, "kl": 0.00413818359375, "learning_rate": 2e-07, "loss": -0.008323472738265992, "memory(GiB)": 113.5, "reward": 0.24166667610406875, "reward_std": 0.29187673330307007, "rewards/MultiModalAccuracyORM/mean": 0.24166667610406875, "rewards/MultiModalAccuracyORM/std": 0.29187673330307007, "step": 355, "train_speed(iter/s)": 0.032744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.9, "completions/mean_length": 327.34167633056643, "completions/min_length": 184.3, "epoch": 0.14545454545454545, "grad_norm": 1.1619770767978876, "kl": 0.01970672607421875, "learning_rate": 2e-07, "loss": 0.014476829767227173, "memory(GiB)": 113.5, "reward": 0.3916666731238365, "reward_std": 0.35942656397819517, "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, "rewards/MultiModalAccuracyORM/std": 0.35942656397819517, "step": 360, "train_speed(iter/s)": 0.032848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.8, "completions/mean_length": 331.90001220703124, "completions/min_length": 222.0, "epoch": 0.14747474747474748, "grad_norm": 1.4073504269814208, "kl": 0.006307220458984375, "learning_rate": 2e-07, "loss": 0.03325994312763214, "memory(GiB)": 113.5, "reward": 0.05833333432674408, "reward_std": 0.16069675385951995, "rewards/MultiModalAccuracyORM/mean": 0.05833333432674408, "rewards/MultiModalAccuracyORM/std": 0.16069675385951995, "step": 365, "train_speed(iter/s)": 0.032856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/mean_length": 423.8916839599609, "completions/min_length": 252.2, "epoch": 0.1494949494949495, "grad_norm": 1.4976657581094635, "kl": 0.006170654296875, "learning_rate": 2e-07, "loss": -0.01670956760644913, "memory(GiB)": 113.5, "reward": 0.20000000223517417, "reward_std": 0.21999078392982482, "rewards/MultiModalAccuracyORM/mean": 0.20000000223517417, "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, "step": 370, "train_speed(iter/s)": 0.032832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.2, "completions/mean_length": 363.00001068115233, "completions/min_length": 182.0, "epoch": 0.15151515151515152, "grad_norm": 2.481807345956626, "kl": 0.0046051025390625, "learning_rate": 2e-07, "loss": 0.04444247186183929, "memory(GiB)": 113.5, "reward": 0.400000012665987, "reward_std": 0.3985941380262375, "rewards/MultiModalAccuracyORM/mean": 0.400000012665987, "rewards/MultiModalAccuracyORM/std": 0.3985941380262375, "step": 375, "train_speed(iter/s)": 0.032805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.2, "completions/mean_length": 362.1333435058594, "completions/min_length": 207.8, "epoch": 0.15353535353535352, "grad_norm": 1.225556055703092, "kl": 0.01065216064453125, "learning_rate": 2e-07, "loss": 0.0010599255561828612, "memory(GiB)": 113.5, "reward": 0.2250000022351742, "reward_std": 0.22698737680912018, "rewards/MultiModalAccuracyORM/mean": 0.2250000022351742, "rewards/MultiModalAccuracyORM/std": 0.22698737680912018, "step": 380, "train_speed(iter/s)": 0.032797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.5, "completions/mean_length": 259.9750068664551, "completions/min_length": 151.0, "epoch": 0.15555555555555556, "grad_norm": 3.170333391476991, "kl": 0.010870361328125, "learning_rate": 2e-07, "loss": 0.04853119254112244, "memory(GiB)": 113.5, "reward": 0.4500000074505806, "reward_std": 0.32345272302627565, "rewards/MultiModalAccuracyORM/mean": 0.4500000074505806, "rewards/MultiModalAccuracyORM/std": 0.32345272302627565, "step": 385, "train_speed(iter/s)": 0.032869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.9, "completions/mean_length": 359.0833465576172, "completions/min_length": 170.4, "epoch": 0.15757575757575756, "grad_norm": 1.6322015536148482, "kl": 0.00597076416015625, "learning_rate": 2e-07, "loss": -0.003878127783536911, "memory(GiB)": 113.5, "reward": 0.19166667237877846, "reward_std": 0.3196614503860474, "rewards/MultiModalAccuracyORM/mean": 0.19166667237877846, "rewards/MultiModalAccuracyORM/std": 0.3196614503860474, "step": 390, "train_speed(iter/s)": 0.032905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.1, "completions/mean_length": 429.06668014526366, "completions/min_length": 281.5, "epoch": 0.1595959595959596, "grad_norm": 2.750918910992668, "kl": 0.059673309326171875, "learning_rate": 2e-07, "loss": 0.016079676151275635, "memory(GiB)": 113.5, "reward": 0.14166666865348815, "reward_std": 0.23854664266109465, "rewards/MultiModalAccuracyORM/mean": 0.14166666865348815, "rewards/MultiModalAccuracyORM/std": 0.23854664266109465, "step": 395, "train_speed(iter/s)": 0.032918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 722.7, "completions/mean_length": 381.7416793823242, "completions/min_length": 187.8, "epoch": 0.16161616161616163, "grad_norm": 1.276714724002977, "kl": 0.004840087890625, "learning_rate": 2e-07, "loss": 0.030894118547439575, "memory(GiB)": 113.5, "reward": 0.2750000074505806, "reward_std": 0.21374862194061278, "rewards/MultiModalAccuracyORM/mean": 0.2750000074505806, "rewards/MultiModalAccuracyORM/std": 0.21374862194061278, "step": 400, "train_speed(iter/s)": 0.032861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/mean_length": 292.2416748046875, "completions/min_length": 188.5, "epoch": 0.16363636363636364, "grad_norm": 1.285497466986634, "kl": 0.00401611328125, "learning_rate": 2e-07, "loss": -0.00028939247131347655, "memory(GiB)": 113.5, "reward": 0.25833333656191826, "reward_std": 0.2986306995153427, "rewards/MultiModalAccuracyORM/mean": 0.25833333656191826, "rewards/MultiModalAccuracyORM/std": 0.2986306995153427, "step": 405, "train_speed(iter/s)": 0.032956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.5, "completions/mean_length": 332.90001373291017, "completions/min_length": 195.8, "epoch": 0.16565656565656567, "grad_norm": 2.4986293478171695, "kl": 0.0099639892578125, "learning_rate": 2e-07, "loss": 0.01775420904159546, "memory(GiB)": 113.5, "reward": 0.14166666939854622, "reward_std": 0.2355453997850418, "rewards/MultiModalAccuracyORM/mean": 0.14166666939854622, "rewards/MultiModalAccuracyORM/std": 0.2355453997850418, "step": 410, "train_speed(iter/s)": 0.032979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.6, "completions/mean_length": 352.77500915527344, "completions/min_length": 189.9, "epoch": 0.16767676767676767, "grad_norm": 1.8788296454969475, "kl": 0.00422210693359375, "learning_rate": 2e-07, "loss": -0.005545926094055176, "memory(GiB)": 113.5, "reward": 0.32500001043081284, "reward_std": 0.3388330668210983, "rewards/MultiModalAccuracyORM/mean": 0.32500001043081284, "rewards/MultiModalAccuracyORM/std": 0.3388330668210983, "step": 415, "train_speed(iter/s)": 0.033025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 690.9, "completions/mean_length": 414.40001068115237, "completions/min_length": 239.5, "epoch": 0.1696969696969697, "grad_norm": 0.07032446522446908, "kl": 0.005554962158203125, "learning_rate": 2e-07, "loss": -0.002293400466442108, "memory(GiB)": 113.5, "reward": 0.20833333879709243, "reward_std": 0.21973656117916107, "rewards/MultiModalAccuracyORM/mean": 0.20833333879709243, "rewards/MultiModalAccuracyORM/std": 0.21973656117916107, "step": 420, "train_speed(iter/s)": 0.032985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.6, "completions/mean_length": 308.7250091552734, "completions/min_length": 175.5, "epoch": 0.1717171717171717, "grad_norm": 1.4798323094999317, "kl": 0.00482025146484375, "learning_rate": 2e-07, "loss": 0.01790083050727844, "memory(GiB)": 113.5, "reward": 0.25000000521540644, "reward_std": 0.2104335606098175, "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, "rewards/MultiModalAccuracyORM/std": 0.2104335606098175, "step": 425, "train_speed(iter/s)": 0.033033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.9, "completions/mean_length": 350.28334655761716, "completions/min_length": 202.6, "epoch": 0.17373737373737375, "grad_norm": 1.9633281758859618, "kl": 0.004430389404296875, "learning_rate": 2e-07, "loss": 0.0008227840065956116, "memory(GiB)": 113.5, "reward": 0.37500001713633535, "reward_std": 0.3780064254999161, "rewards/MultiModalAccuracyORM/mean": 0.37500001713633535, "rewards/MultiModalAccuracyORM/std": 0.3780064254999161, "step": 430, "train_speed(iter/s)": 0.033105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.3, "completions/mean_length": 264.40834045410156, "completions/min_length": 139.7, "epoch": 0.17575757575757575, "grad_norm": 1.9529808864934317, "kl": 0.00596923828125, "learning_rate": 2e-07, "loss": -0.06038873791694641, "memory(GiB)": 113.5, "reward": 0.3333333387970924, "reward_std": 0.29837648272514344, "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, "rewards/MultiModalAccuracyORM/std": 0.29837648272514344, "step": 435, "train_speed(iter/s)": 0.033193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.5, "completions/mean_length": 296.8333374023438, "completions/min_length": 171.6, "epoch": 0.17777777777777778, "grad_norm": 0.03169449948005974, "kl": 0.00481719970703125, "learning_rate": 2e-07, "loss": 0.018176303803920747, "memory(GiB)": 113.5, "reward": 0.25000000968575475, "reward_std": 0.2596701592206955, "rewards/MultiModalAccuracyORM/mean": 0.25000000968575475, "rewards/MultiModalAccuracyORM/std": 0.2596701592206955, "step": 440, "train_speed(iter/s)": 0.03327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.5, "completions/mean_length": 268.50834197998046, "completions/min_length": 126.3, "epoch": 0.1797979797979798, "grad_norm": 2.4262437209194774, "kl": 0.0057281494140625, "learning_rate": 2e-07, "loss": -0.034365218877792356, "memory(GiB)": 113.5, "reward": 0.2500000074505806, "reward_std": 0.38001427948474886, "rewards/MultiModalAccuracyORM/mean": 0.2500000074505806, "rewards/MultiModalAccuracyORM/std": 0.38001427948474886, "step": 445, "train_speed(iter/s)": 0.033325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.4, "completions/mean_length": 337.42501373291014, "completions/min_length": 194.1, "epoch": 0.18181818181818182, "grad_norm": 2.3770604401183997, "kl": 0.00361785888671875, "learning_rate": 2e-07, "loss": -0.010681581497192384, "memory(GiB)": 113.5, "reward": 0.2833333358168602, "reward_std": 0.24490799605846406, "rewards/MultiModalAccuracyORM/mean": 0.2833333358168602, "rewards/MultiModalAccuracyORM/std": 0.24490799605846406, "step": 450, "train_speed(iter/s)": 0.033355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 645.6, "completions/mean_length": 383.0333450317383, "completions/min_length": 228.9, "epoch": 0.18383838383838383, "grad_norm": 1.5212583244692293, "kl": 0.0044342041015625, "learning_rate": 2e-07, "loss": 0.010468679666519164, "memory(GiB)": 113.5, "reward": 0.22500000447034835, "reward_std": 0.29815449118614196, "rewards/MultiModalAccuracyORM/mean": 0.22500000447034835, "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, "step": 455, "train_speed(iter/s)": 0.033387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 682.1, "completions/mean_length": 331.59167556762696, "completions/min_length": 148.5, "epoch": 0.18585858585858586, "grad_norm": 2.3101338751804605, "kl": 0.005951690673828125, "learning_rate": 2e-07, "loss": 0.013955891132354736, "memory(GiB)": 113.5, "reward": 0.2083333395421505, "reward_std": 0.3207202464342117, "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, "step": 460, "train_speed(iter/s)": 0.033356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.1, "completions/mean_length": 324.4416748046875, "completions/min_length": 189.3, "epoch": 0.18787878787878787, "grad_norm": 1.9306296492930712, "kl": 0.00476531982421875, "learning_rate": 2e-07, "loss": 0.0007774412631988525, "memory(GiB)": 113.5, "reward": 0.20833333805203438, "reward_std": 0.18332210481166838, "rewards/MultiModalAccuracyORM/mean": 0.20833333805203438, "rewards/MultiModalAccuracyORM/std": 0.18332210481166838, "step": 465, "train_speed(iter/s)": 0.03337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.5, "completions/mean_length": 451.75001220703126, "completions/min_length": 242.0, "epoch": 0.1898989898989899, "grad_norm": 2.9489928820712117, "kl": 0.003478240966796875, "learning_rate": 2e-07, "loss": 0.0002551078796386719, "memory(GiB)": 113.5, "reward": 0.14166666865348815, "reward_std": 0.22453648447990418, "rewards/MultiModalAccuracyORM/mean": 0.14166666865348815, "rewards/MultiModalAccuracyORM/std": 0.22453648447990418, "step": 470, "train_speed(iter/s)": 0.033353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.2, "completions/mean_length": 419.60001983642576, "completions/min_length": 252.7, "epoch": 0.1919191919191919, "grad_norm": 1.657148402320105, "kl": 0.00272979736328125, "learning_rate": 2e-07, "loss": -0.02806915044784546, "memory(GiB)": 113.5, "reward": 0.25000000894069674, "reward_std": 0.3011055916547775, "rewards/MultiModalAccuracyORM/mean": 0.25000000894069674, "rewards/MultiModalAccuracyORM/std": 0.3011055916547775, "step": 475, "train_speed(iter/s)": 0.033331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.1, "completions/mean_length": 373.71667633056643, "completions/min_length": 256.2, "epoch": 0.19393939393939394, "grad_norm": 2.869711221257181, "kl": 0.0064971923828125, "learning_rate": 2e-07, "loss": -0.002555108070373535, "memory(GiB)": 113.5, "reward": 0.3916666768491268, "reward_std": 0.2636824816465378, "rewards/MultiModalAccuracyORM/mean": 0.3916666768491268, "rewards/MultiModalAccuracyORM/std": 0.2636824816465378, "step": 480, "train_speed(iter/s)": 0.033361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.2, "completions/mean_length": 391.5833404541016, "completions/min_length": 218.1, "epoch": 0.19595959595959597, "grad_norm": 1.9631879540052586, "kl": 0.005725860595703125, "learning_rate": 2e-07, "loss": 0.0018699795007705688, "memory(GiB)": 113.5, "reward": 0.14166667237877845, "reward_std": 0.15595400035381318, "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, "rewards/MultiModalAccuracyORM/std": 0.15595400035381318, "step": 485, "train_speed(iter/s)": 0.033379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/mean_length": 295.8583442687988, "completions/min_length": 156.6, "epoch": 0.19797979797979798, "grad_norm": 0.037793670384228664, "kl": 0.0073211669921875, "learning_rate": 2e-07, "loss": 0.020484793186187743, "memory(GiB)": 113.5, "reward": 0.20000000149011612, "reward_std": 0.24483142793178558, "rewards/MultiModalAccuracyORM/mean": 0.20000000149011612, "rewards/MultiModalAccuracyORM/std": 0.24483142793178558, "step": 490, "train_speed(iter/s)": 0.033414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.9, "completions/mean_length": 380.616682434082, "completions/min_length": 220.0, "epoch": 0.2, "grad_norm": 2.1512862837965163, "kl": 0.003929901123046875, "learning_rate": 2e-07, "loss": 0.0034599393606185914, "memory(GiB)": 113.5, "reward": 0.30000000521540643, "reward_std": 0.30715312659740446, "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, "rewards/MultiModalAccuracyORM/std": 0.30715312659740446, "step": 495, "train_speed(iter/s)": 0.033449 }, { "epoch": 0.20202020202020202, "grad_norm": 2.239910097717952, "learning_rate": 2e-07, "loss": 0.014047640562057494, "memory(GiB)": 113.5, "step": 500, "train_speed(iter/s)": 0.033495 }, { "epoch": 0.20202020202020202, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0016666666666666666, "eval_completions/max_length": 591.26, "eval_completions/mean_length": 358.19000946044923, "eval_completions/min_length": 202.24, "eval_kl": 0.002655487060546875, "eval_loss": 0.00915438961237669, "eval_reward": 0.22833333894610405, "eval_reward_std": 0.28466624081134795, "eval_rewards/MultiModalAccuracyORM/mean": 0.22833333894610405, "eval_rewards/MultiModalAccuracyORM/std": 0.28466624081134795, "eval_runtime": 608.1673, "eval_samples_per_second": 0.082, "eval_steps_per_second": 0.008, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.35, "completions/mean_length": 332.39167404174805, "completions/min_length": 199.1, "epoch": 0.20404040404040405, "grad_norm": 2.3622087713081186, "kl": 0.004245758056640625, "learning_rate": 2e-07, "loss": -0.00013803243637084962, "memory(GiB)": 113.5, "reward": 0.3125000067055225, "reward_std": 0.3219920754432678, "rewards/MultiModalAccuracyORM/mean": 0.3125000067055225, "rewards/MultiModalAccuracyORM/std": 0.3219920754432678, "step": 505, "train_speed(iter/s)": 0.031802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.3, "completions/mean_length": 374.4833450317383, "completions/min_length": 209.5, "epoch": 0.20606060606060606, "grad_norm": 1.7757575475794216, "kl": 0.006531524658203125, "learning_rate": 2e-07, "loss": 0.03503022789955139, "memory(GiB)": 113.5, "reward": 0.29166667312383654, "reward_std": 0.28778324127197263, "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, "rewards/MultiModalAccuracyORM/std": 0.28778324127197263, "step": 510, "train_speed(iter/s)": 0.031819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.2, "completions/mean_length": 300.96667633056643, "completions/min_length": 179.3, "epoch": 0.2080808080808081, "grad_norm": 2.2727530064482235, "kl": 0.01416778564453125, "learning_rate": 2e-07, "loss": 0.022283512353897094, "memory(GiB)": 113.5, "reward": 0.24166667610406875, "reward_std": 0.3347875773906708, "rewards/MultiModalAccuracyORM/mean": 0.24166667610406875, "rewards/MultiModalAccuracyORM/std": 0.3347875773906708, "step": 515, "train_speed(iter/s)": 0.03184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.9, "completions/mean_length": 341.6000099182129, "completions/min_length": 175.8, "epoch": 0.2101010101010101, "grad_norm": 1.1487867895660082, "kl": 0.00421295166015625, "learning_rate": 2e-07, "loss": 0.04290072023868561, "memory(GiB)": 113.5, "reward": 0.3666666761040688, "reward_std": 0.28399197161197665, "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, "rewards/MultiModalAccuracyORM/std": 0.28399197161197665, "step": 520, "train_speed(iter/s)": 0.03186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.5, "completions/mean_length": 344.0500091552734, "completions/min_length": 175.8, "epoch": 0.21212121212121213, "grad_norm": 2.2941717609617767, "kl": 0.0046539306640625, "learning_rate": 2e-07, "loss": 0.004269888997077942, "memory(GiB)": 113.5, "reward": 0.30833333879709246, "reward_std": 0.3267677813768387, "rewards/MultiModalAccuracyORM/mean": 0.30833333879709246, "rewards/MultiModalAccuracyORM/std": 0.3267677813768387, "step": 525, "train_speed(iter/s)": 0.031902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.2, "completions/mean_length": 345.83334197998045, "completions/min_length": 172.9, "epoch": 0.21414141414141413, "grad_norm": 1.2948745647020719, "kl": 0.004862213134765625, "learning_rate": 2e-07, "loss": -0.007743622362613678, "memory(GiB)": 113.5, "reward": 0.33333333805203436, "reward_std": 0.25897532403469087, "rewards/MultiModalAccuracyORM/mean": 0.33333333805203436, "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, "step": 530, "train_speed(iter/s)": 0.031973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 277.8500061035156, "completions/min_length": 127.7, "epoch": 0.21616161616161617, "grad_norm": 2.820652916445064, "kl": 0.004701995849609375, "learning_rate": 2e-07, "loss": 0.019122210144996644, "memory(GiB)": 113.5, "reward": 0.25833334028720856, "reward_std": 0.38930273354053496, "rewards/MultiModalAccuracyORM/mean": 0.25833334028720856, "rewards/MultiModalAccuracyORM/std": 0.38930273354053496, "step": 535, "train_speed(iter/s)": 0.032051 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.3, "completions/mean_length": 311.0666732788086, "completions/min_length": 155.9, "epoch": 0.21818181818181817, "grad_norm": 0.02000320571323216, "kl": 0.006194305419921875, "learning_rate": 2e-07, "loss": 0.023233750462532045, "memory(GiB)": 113.5, "reward": 0.29166667237877847, "reward_std": 0.26298522055149076, "rewards/MultiModalAccuracyORM/mean": 0.29166667237877847, "rewards/MultiModalAccuracyORM/std": 0.26298522055149076, "step": 540, "train_speed(iter/s)": 0.032099 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.1, "completions/mean_length": 298.1750099182129, "completions/min_length": 159.8, "epoch": 0.2202020202020202, "grad_norm": 1.7992434949177767, "kl": 0.00469207763671875, "learning_rate": 2e-07, "loss": 0.015616017580032348, "memory(GiB)": 113.5, "reward": 0.32500000596046447, "reward_std": 0.22704697251319886, "rewards/MultiModalAccuracyORM/mean": 0.32500000596046447, "rewards/MultiModalAccuracyORM/std": 0.22704697251319886, "step": 545, "train_speed(iter/s)": 0.032156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.6, "completions/mean_length": 273.84167633056643, "completions/min_length": 145.6, "epoch": 0.2222222222222222, "grad_norm": 2.8559923799679794, "kl": 0.00508270263671875, "learning_rate": 2e-07, "loss": 0.050173360109329226, "memory(GiB)": 113.5, "reward": 0.37500001341104505, "reward_std": 0.33303394317626955, "rewards/MultiModalAccuracyORM/mean": 0.37500001341104505, "rewards/MultiModalAccuracyORM/std": 0.33303394317626955, "step": 550, "train_speed(iter/s)": 0.032216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.5, "completions/mean_length": 421.71667633056643, "completions/min_length": 240.9, "epoch": 0.22424242424242424, "grad_norm": 2.3260782482625366, "kl": 0.005718994140625, "learning_rate": 2e-07, "loss": 0.02654660940170288, "memory(GiB)": 113.5, "reward": 0.36666667759418486, "reward_std": 0.46648178398609164, "rewards/MultiModalAccuracyORM/mean": 0.36666667759418486, "rewards/MultiModalAccuracyORM/std": 0.46648178398609164, "step": 555, "train_speed(iter/s)": 0.032254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 695.9, "completions/mean_length": 376.9666717529297, "completions/min_length": 211.5, "epoch": 0.22626262626262628, "grad_norm": 1.9191123297699473, "kl": 0.0039215087890625, "learning_rate": 2e-07, "loss": 0.013482053577899934, "memory(GiB)": 113.5, "reward": 0.1750000037252903, "reward_std": 0.3042020261287689, "rewards/MultiModalAccuracyORM/mean": 0.1750000037252903, "rewards/MultiModalAccuracyORM/std": 0.3042020261287689, "step": 560, "train_speed(iter/s)": 0.032243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.5, "completions/mean_length": 348.8333450317383, "completions/min_length": 184.5, "epoch": 0.22828282828282828, "grad_norm": 2.0009650914845873, "kl": 0.01170501708984375, "learning_rate": 2e-07, "loss": 0.035267585515975954, "memory(GiB)": 113.5, "reward": 0.3583333410322666, "reward_std": 0.38205191493034363, "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, "rewards/MultiModalAccuracyORM/std": 0.38205191493034363, "step": 565, "train_speed(iter/s)": 0.032295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.6, "completions/mean_length": 319.6166748046875, "completions/min_length": 181.1, "epoch": 0.23030303030303031, "grad_norm": 0.1996246857202343, "kl": 0.005075836181640625, "learning_rate": 2e-07, "loss": -0.02471494972705841, "memory(GiB)": 113.5, "reward": 0.24166666939854622, "reward_std": 0.2549654275178909, "rewards/MultiModalAccuracyORM/mean": 0.24166666939854622, "rewards/MultiModalAccuracyORM/std": 0.2549654275178909, "step": 570, "train_speed(iter/s)": 0.032297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/mean_length": 338.40833892822263, "completions/min_length": 187.7, "epoch": 0.23232323232323232, "grad_norm": 2.3362669602060033, "kl": 0.00451202392578125, "learning_rate": 2e-07, "loss": 0.03307419717311859, "memory(GiB)": 113.5, "reward": 0.2000000037252903, "reward_std": 0.3081523299217224, "rewards/MultiModalAccuracyORM/mean": 0.2000000037252903, "rewards/MultiModalAccuracyORM/std": 0.3081523299217224, "step": 575, "train_speed(iter/s)": 0.03234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.9, "completions/mean_length": 292.68334197998047, "completions/min_length": 158.3, "epoch": 0.23434343434343435, "grad_norm": 2.8417938649503394, "kl": 0.014810943603515625, "learning_rate": 2e-07, "loss": -0.03590070009231568, "memory(GiB)": 113.5, "reward": 0.3500000089406967, "reward_std": 0.39629932343959806, "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, "rewards/MultiModalAccuracyORM/std": 0.39629932343959806, "step": 580, "train_speed(iter/s)": 0.032381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.8, "completions/mean_length": 396.57501831054685, "completions/min_length": 239.6, "epoch": 0.23636363636363636, "grad_norm": 0.03715745404820811, "kl": 0.00491180419921875, "learning_rate": 2e-07, "loss": -0.0016106054186820983, "memory(GiB)": 113.5, "reward": 0.25000000596046446, "reward_std": 0.27749558687210085, "rewards/MultiModalAccuracyORM/mean": 0.25000000596046446, "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, "step": 585, "train_speed(iter/s)": 0.032385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.1, "completions/mean_length": 371.41667861938475, "completions/min_length": 206.0, "epoch": 0.2383838383838384, "grad_norm": 2.5904505607936237, "kl": 0.0038330078125, "learning_rate": 2e-07, "loss": -0.0013609230518341064, "memory(GiB)": 113.5, "reward": 0.4416666768491268, "reward_std": 0.3044206529855728, "rewards/MultiModalAccuracyORM/mean": 0.4416666768491268, "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, "step": 590, "train_speed(iter/s)": 0.032404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.8, "completions/mean_length": 332.2250091552734, "completions/min_length": 175.5, "epoch": 0.2404040404040404, "grad_norm": 3.252161568752739, "kl": 0.00532073974609375, "learning_rate": 2e-07, "loss": 0.022338399291038515, "memory(GiB)": 113.5, "reward": 0.316666679084301, "reward_std": 0.35766714811325073, "rewards/MultiModalAccuracyORM/mean": 0.316666679084301, "rewards/MultiModalAccuracyORM/std": 0.35766714811325073, "step": 595, "train_speed(iter/s)": 0.032438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.6, "completions/mean_length": 366.18334045410154, "completions/min_length": 210.1, "epoch": 0.24242424242424243, "grad_norm": 1.7160058152461715, "kl": 0.0050140380859375, "learning_rate": 2e-07, "loss": -0.0045736730098724365, "memory(GiB)": 113.5, "reward": 0.3250000074505806, "reward_std": 0.32682737708091736, "rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, "rewards/MultiModalAccuracyORM/std": 0.32682737708091736, "step": 600, "train_speed(iter/s)": 0.032452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 968.4, "completions/mean_length": 407.6500114440918, "completions/min_length": 221.4, "epoch": 0.24444444444444444, "grad_norm": 1.5763528784256282, "kl": 0.0037322998046875, "learning_rate": 2e-07, "loss": 0.003979828953742981, "memory(GiB)": 113.5, "reward": 0.30000000819563866, "reward_std": 0.4196960777044296, "rewards/MultiModalAccuracyORM/mean": 0.30000000819563866, "rewards/MultiModalAccuracyORM/std": 0.4196960777044296, "step": 605, "train_speed(iter/s)": 0.0324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.6, "completions/mean_length": 293.57500610351565, "completions/min_length": 166.8, "epoch": 0.24646464646464647, "grad_norm": 3.2425538671850047, "kl": 0.009912109375, "learning_rate": 2e-07, "loss": 0.024757757782936096, "memory(GiB)": 113.5, "reward": 0.20000000670552254, "reward_std": 0.2184557795524597, "rewards/MultiModalAccuracyORM/mean": 0.20000000670552254, "rewards/MultiModalAccuracyORM/std": 0.2184557795524597, "step": 610, "train_speed(iter/s)": 0.032454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.6, "completions/mean_length": 272.27501068115237, "completions/min_length": 161.5, "epoch": 0.24848484848484848, "grad_norm": 2.9002217301843682, "kl": 0.006238555908203125, "learning_rate": 2e-07, "loss": 0.006809020042419433, "memory(GiB)": 113.5, "reward": 0.2083333373069763, "reward_std": 0.28784283697605134, "rewards/MultiModalAccuracyORM/mean": 0.2083333373069763, "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, "step": 615, "train_speed(iter/s)": 0.032521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.6, "completions/mean_length": 323.6333435058594, "completions/min_length": 178.6, "epoch": 0.2505050505050505, "grad_norm": 1.6543202512317519, "kl": 0.005059814453125, "learning_rate": 2e-07, "loss": -0.013031059503555298, "memory(GiB)": 113.5, "reward": 0.21666667088866234, "reward_std": 0.22625694572925567, "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, "step": 620, "train_speed(iter/s)": 0.032557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.9, "completions/mean_length": 368.2416793823242, "completions/min_length": 204.2, "epoch": 0.25252525252525254, "grad_norm": 1.9398904097162017, "kl": 0.00662689208984375, "learning_rate": 2e-07, "loss": 0.020694077014923096, "memory(GiB)": 113.5, "reward": 0.20833333805203438, "reward_std": 0.2567190587520599, "rewards/MultiModalAccuracyORM/mean": 0.20833333805203438, "rewards/MultiModalAccuracyORM/std": 0.2567190587520599, "step": 625, "train_speed(iter/s)": 0.032596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.2, "completions/mean_length": 298.1750030517578, "completions/min_length": 161.6, "epoch": 0.2545454545454545, "grad_norm": 3.9909953401900657, "kl": 0.00722808837890625, "learning_rate": 2e-07, "loss": 0.012149769067764282, "memory(GiB)": 113.5, "reward": 0.3500000089406967, "reward_std": 0.21594529151916503, "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, "rewards/MultiModalAccuracyORM/std": 0.21594529151916503, "step": 630, "train_speed(iter/s)": 0.03264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.6, "completions/mean_length": 335.81667404174806, "completions/min_length": 203.4, "epoch": 0.25656565656565655, "grad_norm": 3.223504719612698, "kl": 0.00595703125, "learning_rate": 2e-07, "loss": 0.0355703592300415, "memory(GiB)": 113.5, "reward": 0.39166667833924296, "reward_std": 0.3838055461645126, "rewards/MultiModalAccuracyORM/mean": 0.39166667833924296, "rewards/MultiModalAccuracyORM/std": 0.3838055461645126, "step": 635, "train_speed(iter/s)": 0.032663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 399.683349609375, "completions/min_length": 215.3, "epoch": 0.2585858585858586, "grad_norm": 0.031047629899617252, "kl": 0.00643310546875, "learning_rate": 2e-07, "loss": -0.002796703577041626, "memory(GiB)": 113.5, "reward": 0.20000000670552254, "reward_std": 0.24866367280483245, "rewards/MultiModalAccuracyORM/mean": 0.20000000670552254, "rewards/MultiModalAccuracyORM/std": 0.24866367280483245, "step": 640, "train_speed(iter/s)": 0.032678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.7, "completions/mean_length": 308.32500915527345, "completions/min_length": 162.9, "epoch": 0.2606060606060606, "grad_norm": 2.661462961010607, "kl": 0.0068939208984375, "learning_rate": 2e-07, "loss": 0.006179103255271911, "memory(GiB)": 113.5, "reward": 0.25833333656191826, "reward_std": 0.2652174860239029, "rewards/MultiModalAccuracyORM/mean": 0.25833333656191826, "rewards/MultiModalAccuracyORM/std": 0.2652174860239029, "step": 645, "train_speed(iter/s)": 0.032733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.6, "completions/mean_length": 246.7083396911621, "completions/min_length": 116.2, "epoch": 0.26262626262626265, "grad_norm": 2.282962166826479, "kl": 0.00615692138671875, "learning_rate": 2e-07, "loss": -0.022863130271434783, "memory(GiB)": 113.5, "reward": 0.22500000149011612, "reward_std": 0.25664491653442384, "rewards/MultiModalAccuracyORM/mean": 0.22500000149011612, "rewards/MultiModalAccuracyORM/std": 0.25664491653442384, "step": 650, "train_speed(iter/s)": 0.032795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.1, "completions/mean_length": 367.6666763305664, "completions/min_length": 205.7, "epoch": 0.26464646464646463, "grad_norm": 2.380599579002688, "kl": 0.0057464599609375, "learning_rate": 2e-07, "loss": -0.013085761666297912, "memory(GiB)": 113.5, "reward": 0.2583333395421505, "reward_std": 0.2993255376815796, "rewards/MultiModalAccuracyORM/mean": 0.2583333395421505, "rewards/MultiModalAccuracyORM/std": 0.2993255376815796, "step": 655, "train_speed(iter/s)": 0.032829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.3, "completions/mean_length": 398.7916854858398, "completions/min_length": 212.0, "epoch": 0.26666666666666666, "grad_norm": 2.1060964762409085, "kl": 0.0065460205078125, "learning_rate": 2e-07, "loss": 0.001984366774559021, "memory(GiB)": 113.5, "reward": 0.2583333417773247, "reward_std": 0.35184402465820314, "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, "rewards/MultiModalAccuracyORM/std": 0.35184402465820314, "step": 660, "train_speed(iter/s)": 0.032832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.5, "completions/mean_length": 379.8500099182129, "completions/min_length": 222.5, "epoch": 0.2686868686868687, "grad_norm": 3.0979902221373083, "kl": 0.00526275634765625, "learning_rate": 2e-07, "loss": -0.00811660885810852, "memory(GiB)": 113.5, "reward": 0.2583333358168602, "reward_std": 0.22446234226226808, "rewards/MultiModalAccuracyORM/mean": 0.2583333358168602, "rewards/MultiModalAccuracyORM/std": 0.22446234226226808, "step": 665, "train_speed(iter/s)": 0.03283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.4, "completions/mean_length": 274.05001068115234, "completions/min_length": 143.8, "epoch": 0.27070707070707073, "grad_norm": 2.886266049614615, "kl": 0.00730133056640625, "learning_rate": 2e-07, "loss": 0.008006072044372559, "memory(GiB)": 113.5, "reward": 0.2500000037252903, "reward_std": 0.3111986219882965, "rewards/MultiModalAccuracyORM/mean": 0.2500000037252903, "rewards/MultiModalAccuracyORM/std": 0.3111986219882965, "step": 670, "train_speed(iter/s)": 0.032864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.8, "completions/mean_length": 304.02500915527344, "completions/min_length": 156.8, "epoch": 0.2727272727272727, "grad_norm": 0.3952238447884339, "kl": 0.0077239990234375, "learning_rate": 2e-07, "loss": 0.036022895574569704, "memory(GiB)": 113.5, "reward": 0.37500001192092897, "reward_std": 0.32858100831508635, "rewards/MultiModalAccuracyORM/mean": 0.37500001192092897, "rewards/MultiModalAccuracyORM/std": 0.32858100831508635, "step": 675, "train_speed(iter/s)": 0.032929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.8, "completions/mean_length": 364.55834197998047, "completions/min_length": 203.6, "epoch": 0.27474747474747474, "grad_norm": 0.0686693440428557, "kl": 0.00585784912109375, "learning_rate": 2e-07, "loss": 0.006394723057746887, "memory(GiB)": 113.5, "reward": 0.18333334103226662, "reward_std": 0.24637180864810942, "rewards/MultiModalAccuracyORM/mean": 0.18333334103226662, "rewards/MultiModalAccuracyORM/std": 0.24637180864810942, "step": 680, "train_speed(iter/s)": 0.032965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.4, "completions/mean_length": 235.7500099182129, "completions/min_length": 118.5, "epoch": 0.2767676767676768, "grad_norm": 2.1797104035382873, "kl": 0.01773681640625, "learning_rate": 2e-07, "loss": 0.008138242363929748, "memory(GiB)": 113.5, "reward": 0.26666667610406875, "reward_std": 0.3862804383039474, "rewards/MultiModalAccuracyORM/mean": 0.26666667610406875, "rewards/MultiModalAccuracyORM/std": 0.3862804383039474, "step": 685, "train_speed(iter/s)": 0.032993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.3, "completions/mean_length": 420.05834503173827, "completions/min_length": 225.0, "epoch": 0.2787878787878788, "grad_norm": 1.2422518482071012, "kl": 0.005517578125, "learning_rate": 2e-07, "loss": -0.025521010160446167, "memory(GiB)": 113.5, "reward": 0.07500000223517418, "reward_std": 0.22218745648860933, "rewards/MultiModalAccuracyORM/mean": 0.07500000223517418, "rewards/MultiModalAccuracyORM/std": 0.22218745648860933, "step": 690, "train_speed(iter/s)": 0.032968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.4, "completions/mean_length": 335.8666763305664, "completions/min_length": 201.2, "epoch": 0.2808080808080808, "grad_norm": 0.8721711597058662, "kl": 0.007273101806640625, "learning_rate": 2e-07, "loss": -0.005113717913627624, "memory(GiB)": 113.5, "reward": 0.2083333395421505, "reward_std": 0.32370694279670714, "rewards/MultiModalAccuracyORM/mean": 0.2083333395421505, "rewards/MultiModalAccuracyORM/std": 0.32370694279670714, "step": 695, "train_speed(iter/s)": 0.032988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 995.9, "completions/mean_length": 439.34168548583983, "completions/min_length": 226.8, "epoch": 0.2828282828282828, "grad_norm": 2.6514991151372906, "kl": 0.00522308349609375, "learning_rate": 2e-07, "loss": 0.03241249620914459, "memory(GiB)": 113.5, "reward": 0.12500000298023223, "reward_std": 0.25916995108127594, "rewards/MultiModalAccuracyORM/mean": 0.12500000298023223, "rewards/MultiModalAccuracyORM/std": 0.25916995108127594, "step": 700, "train_speed(iter/s)": 0.032951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/mean_length": 208.48334121704102, "completions/min_length": 125.4, "epoch": 0.28484848484848485, "grad_norm": 2.9111051216276933, "kl": 0.00951080322265625, "learning_rate": 2e-07, "loss": 0.011016063392162323, "memory(GiB)": 113.5, "reward": 0.49166668131947516, "reward_std": 0.3610968828201294, "rewards/MultiModalAccuracyORM/mean": 0.49166668131947516, "rewards/MultiModalAccuracyORM/std": 0.3610968828201294, "step": 705, "train_speed(iter/s)": 0.033011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 684.6, "completions/mean_length": 394.20833587646484, "completions/min_length": 217.3, "epoch": 0.2868686868686869, "grad_norm": 0.9791505372375504, "kl": 0.0062164306640625, "learning_rate": 2e-07, "loss": 0.009031829237937928, "memory(GiB)": 113.5, "reward": 0.4000000037252903, "reward_std": 0.2825257331132889, "rewards/MultiModalAccuracyORM/mean": 0.4000000037252903, "rewards/MultiModalAccuracyORM/std": 0.2825257331132889, "step": 710, "train_speed(iter/s)": 0.032996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.9, "completions/mean_length": 316.80834197998047, "completions/min_length": 187.2, "epoch": 0.28888888888888886, "grad_norm": 2.748998227170115, "kl": 0.0071197509765625, "learning_rate": 2e-07, "loss": -0.04136030673980713, "memory(GiB)": 113.5, "reward": 0.17500000745058059, "reward_std": 0.2551840543746948, "rewards/MultiModalAccuracyORM/mean": 0.17500000745058059, "rewards/MultiModalAccuracyORM/std": 0.2551840543746948, "step": 715, "train_speed(iter/s)": 0.033011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/mean_length": 334.85834884643555, "completions/min_length": 158.8, "epoch": 0.2909090909090909, "grad_norm": 2.5712788721497355, "kl": 0.005682373046875, "learning_rate": 2e-07, "loss": 0.014300698041915893, "memory(GiB)": 113.5, "reward": 0.4000000089406967, "reward_std": 0.26816858947277067, "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, "rewards/MultiModalAccuracyORM/std": 0.26816858947277067, "step": 720, "train_speed(iter/s)": 0.033012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 613.2, "completions/mean_length": 331.2666763305664, "completions/min_length": 199.3, "epoch": 0.29292929292929293, "grad_norm": 1.8581663152703145, "kl": 0.00664215087890625, "learning_rate": 2e-07, "loss": -0.010144461691379548, "memory(GiB)": 113.5, "reward": 0.21666667386889457, "reward_std": 0.31676994562149047, "rewards/MultiModalAccuracyORM/mean": 0.21666667386889457, "rewards/MultiModalAccuracyORM/std": 0.31676994562149047, "step": 725, "train_speed(iter/s)": 0.033019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.9, "completions/mean_length": 326.3500129699707, "completions/min_length": 181.5, "epoch": 0.29494949494949496, "grad_norm": 0.775697379774875, "kl": 0.00522918701171875, "learning_rate": 2e-07, "loss": 0.0003711044788360596, "memory(GiB)": 113.5, "reward": 0.2333333373069763, "reward_std": 0.3189997851848602, "rewards/MultiModalAccuracyORM/mean": 0.2333333373069763, "rewards/MultiModalAccuracyORM/std": 0.3189997851848602, "step": 730, "train_speed(iter/s)": 0.033016 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.2, "completions/mean_length": 330.55001068115234, "completions/min_length": 198.5, "epoch": 0.296969696969697, "grad_norm": 1.5273483945564796, "kl": 0.006597900390625, "learning_rate": 2e-07, "loss": 0.012019181251525879, "memory(GiB)": 113.5, "reward": 0.34166667312383653, "reward_std": 0.4374805331230164, "rewards/MultiModalAccuracyORM/mean": 0.34166667312383653, "rewards/MultiModalAccuracyORM/std": 0.4374805331230164, "step": 735, "train_speed(iter/s)": 0.033045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.7, "completions/mean_length": 381.558349609375, "completions/min_length": 230.5, "epoch": 0.298989898989899, "grad_norm": 2.3594893660788374, "kl": 0.0050323486328125, "learning_rate": 2e-07, "loss": 0.01788020133972168, "memory(GiB)": 113.5, "reward": 0.20000000596046447, "reward_std": 0.3330695390701294, "rewards/MultiModalAccuracyORM/mean": 0.20000000596046447, "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, "step": 740, "train_speed(iter/s)": 0.033058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.7, "completions/mean_length": 324.033349609375, "completions/min_length": 184.4, "epoch": 0.301010101010101, "grad_norm": 2.29708410353418, "kl": 0.00646209716796875, "learning_rate": 2e-07, "loss": -0.009415292739868164, "memory(GiB)": 113.5, "reward": 0.41666667386889455, "reward_std": 0.2529277890920639, "rewards/MultiModalAccuracyORM/mean": 0.41666667386889455, "rewards/MultiModalAccuracyORM/std": 0.2529277890920639, "step": 745, "train_speed(iter/s)": 0.033072 }, { "epoch": 0.30303030303030304, "grad_norm": 2.3159606947695557, "learning_rate": 2e-07, "loss": 0.006078800559043885, "memory(GiB)": 113.5, "step": 750, "train_speed(iter/s)": 0.033129 }, { "epoch": 0.30303030303030304, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 539.88, "eval_completions/mean_length": 336.97334396362305, "eval_completions/min_length": 192.2, "eval_kl": 0.00380157470703125, "eval_loss": 0.01653137058019638, "eval_reward": 0.2800000062584877, "eval_reward_std": 0.28693030297756195, "eval_rewards/MultiModalAccuracyORM/mean": 0.2800000062584877, "eval_rewards/MultiModalAccuracyORM/std": 0.28693030297756195, "eval_runtime": 588.5073, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.008, "step": 750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.004166666666666667, "completions/max_length": 541.6, "completions/mean_length": 313.96250953674314, "completions/min_length": 176.25, "epoch": 0.30505050505050507, "grad_norm": 1.6419689379277844, "kl": 0.008066558837890625, "learning_rate": 2e-07, "loss": -0.004991033673286438, "memory(GiB)": 113.5, "reward": 0.31250000894069674, "reward_std": 0.35801745802164076, "rewards/MultiModalAccuracyORM/mean": 0.31250000894069674, "rewards/MultiModalAccuracyORM/std": 0.35801745802164076, "step": 755, "train_speed(iter/s)": 0.031887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.9, "completions/mean_length": 292.9916732788086, "completions/min_length": 150.9, "epoch": 0.30707070707070705, "grad_norm": 2.0844707046825723, "kl": 0.00627288818359375, "learning_rate": 2e-07, "loss": 0.0167288139462471, "memory(GiB)": 113.5, "reward": 0.21666666865348816, "reward_std": 0.3554166704416275, "rewards/MultiModalAccuracyORM/mean": 0.21666666865348816, "rewards/MultiModalAccuracyORM/std": 0.3554166704416275, "step": 760, "train_speed(iter/s)": 0.031901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.4, "completions/mean_length": 296.4166778564453, "completions/min_length": 161.6, "epoch": 0.3090909090909091, "grad_norm": 1.7792521459456232, "kl": 0.01121368408203125, "learning_rate": 2e-07, "loss": 0.017529194056987763, "memory(GiB)": 113.5, "reward": 0.4000000134110451, "reward_std": 0.3734437495470047, "rewards/MultiModalAccuracyORM/mean": 0.4000000134110451, "rewards/MultiModalAccuracyORM/std": 0.3734437495470047, "step": 765, "train_speed(iter/s)": 0.031933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.8, "completions/mean_length": 228.51667175292968, "completions/min_length": 120.5, "epoch": 0.3111111111111111, "grad_norm": 2.1702261558412697, "kl": 0.0090240478515625, "learning_rate": 2e-07, "loss": -0.05565891861915588, "memory(GiB)": 113.5, "reward": 0.15000000596046448, "reward_std": 0.24261613488197326, "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, "rewards/MultiModalAccuracyORM/std": 0.24261613488197326, "step": 770, "train_speed(iter/s)": 0.031968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.4, "completions/mean_length": 306.7916748046875, "completions/min_length": 165.3, "epoch": 0.31313131313131315, "grad_norm": 1.2794183651984758, "kl": 0.00740814208984375, "learning_rate": 2e-07, "loss": 0.04246864318847656, "memory(GiB)": 113.5, "reward": 0.46666667982935905, "reward_std": 0.4767192959785461, "rewards/MultiModalAccuracyORM/mean": 0.46666667982935905, "rewards/MultiModalAccuracyORM/std": 0.4767192959785461, "step": 775, "train_speed(iter/s)": 0.032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.4, "completions/mean_length": 415.7916763305664, "completions/min_length": 231.3, "epoch": 0.3151515151515151, "grad_norm": 1.1135361589863462, "kl": 0.00513763427734375, "learning_rate": 2e-07, "loss": 0.028287124633789063, "memory(GiB)": 113.5, "reward": 0.15000000596046448, "reward_std": 0.18482151329517366, "rewards/MultiModalAccuracyORM/mean": 0.15000000596046448, "rewards/MultiModalAccuracyORM/std": 0.18482151329517366, "step": 780, "train_speed(iter/s)": 0.031996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.5, "completions/mean_length": 379.56668243408205, "completions/min_length": 220.9, "epoch": 0.31717171717171716, "grad_norm": 2.365467793016899, "kl": 0.0066986083984375, "learning_rate": 2e-07, "loss": -0.014639610052108764, "memory(GiB)": 113.5, "reward": 0.1833333358168602, "reward_std": 0.20363159477710724, "rewards/MultiModalAccuracyORM/mean": 0.1833333358168602, "rewards/MultiModalAccuracyORM/std": 0.20363159477710724, "step": 785, "train_speed(iter/s)": 0.032005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.3, "completions/mean_length": 267.75834503173826, "completions/min_length": 155.6, "epoch": 0.3191919191919192, "grad_norm": 1.7124152646997672, "kl": 0.00589141845703125, "learning_rate": 2e-07, "loss": 0.001770263910293579, "memory(GiB)": 113.5, "reward": 0.33333333730697634, "reward_std": 0.29483942985534667, "rewards/MultiModalAccuracyORM/mean": 0.33333333730697634, "rewards/MultiModalAccuracyORM/std": 0.29483942985534667, "step": 790, "train_speed(iter/s)": 0.032028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.7, "completions/mean_length": 331.6916763305664, "completions/min_length": 190.2, "epoch": 0.3212121212121212, "grad_norm": 2.1658449229692316, "kl": 0.0065338134765625, "learning_rate": 2e-07, "loss": 0.018888431787490844, "memory(GiB)": 113.5, "reward": 0.22500001192092894, "reward_std": 0.3477985322475433, "rewards/MultiModalAccuracyORM/mean": 0.22500001192092894, "rewards/MultiModalAccuracyORM/std": 0.3477985322475433, "step": 795, "train_speed(iter/s)": 0.032028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.9, "completions/mean_length": 319.0250076293945, "completions/min_length": 159.2, "epoch": 0.32323232323232326, "grad_norm": 0.10193444456144864, "kl": 0.0068878173828125, "learning_rate": 2e-07, "loss": 0.008858251571655273, "memory(GiB)": 113.5, "reward": 0.1916666693985462, "reward_std": 0.2567190587520599, "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, "rewards/MultiModalAccuracyORM/std": 0.2567190587520599, "step": 800, "train_speed(iter/s)": 0.032045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.7, "completions/mean_length": 316.66667556762695, "completions/min_length": 179.2, "epoch": 0.32525252525252524, "grad_norm": 2.315498401390807, "kl": 0.00754852294921875, "learning_rate": 2e-07, "loss": -0.002603813260793686, "memory(GiB)": 113.5, "reward": 0.33333333656191827, "reward_std": 0.2722736746072769, "rewards/MultiModalAccuracyORM/mean": 0.33333333656191827, "rewards/MultiModalAccuracyORM/std": 0.2722736746072769, "step": 805, "train_speed(iter/s)": 0.03204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 675.0, "completions/mean_length": 342.4083450317383, "completions/min_length": 170.8, "epoch": 0.32727272727272727, "grad_norm": 3.1462959818853165, "kl": 0.006378173828125, "learning_rate": 2e-07, "loss": -0.010855591297149659, "memory(GiB)": 113.5, "reward": 0.1916666693985462, "reward_std": 0.3259988039731979, "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, "rewards/MultiModalAccuracyORM/std": 0.3259988039731979, "step": 810, "train_speed(iter/s)": 0.032048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.7, "completions/mean_length": 386.7166778564453, "completions/min_length": 189.5, "epoch": 0.3292929292929293, "grad_norm": 2.441646562638453, "kl": 0.008112335205078125, "learning_rate": 2e-07, "loss": 0.022695478796958924, "memory(GiB)": 113.5, "reward": 0.30833333507180216, "reward_std": 0.29793586432933805, "rewards/MultiModalAccuracyORM/mean": 0.30833333507180216, "rewards/MultiModalAccuracyORM/std": 0.29793586432933805, "step": 815, "train_speed(iter/s)": 0.032056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.7, "completions/mean_length": 339.0166732788086, "completions/min_length": 194.4, "epoch": 0.33131313131313134, "grad_norm": 1.020422275315147, "kl": 0.0112762451171875, "learning_rate": 2e-07, "loss": 0.05103216171264648, "memory(GiB)": 113.5, "reward": 0.22500000670552253, "reward_std": 0.2956440031528473, "rewards/MultiModalAccuracyORM/mean": 0.22500000670552253, "rewards/MultiModalAccuracyORM/std": 0.2956440031528473, "step": 820, "train_speed(iter/s)": 0.032046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.8, "completions/mean_length": 313.6000061035156, "completions/min_length": 160.3, "epoch": 0.3333333333333333, "grad_norm": 3.530685574433075, "kl": 0.00774993896484375, "learning_rate": 2e-07, "loss": 0.0580863893032074, "memory(GiB)": 113.5, "reward": 0.21666667535901069, "reward_std": 0.31899061501026155, "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, "rewards/MultiModalAccuracyORM/std": 0.31899061501026155, "step": 825, "train_speed(iter/s)": 0.03206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.5, "completions/mean_length": 246.05833740234374, "completions/min_length": 134.2, "epoch": 0.33535353535353535, "grad_norm": 0.05179156879937817, "kl": 0.007355499267578125, "learning_rate": 2e-07, "loss": 0.0357688844203949, "memory(GiB)": 113.5, "reward": 0.2750000074505806, "reward_std": 0.20817729830741882, "rewards/MultiModalAccuracyORM/mean": 0.2750000074505806, "rewards/MultiModalAccuracyORM/std": 0.20817729830741882, "step": 830, "train_speed(iter/s)": 0.03208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/mean_length": 445.533349609375, "completions/min_length": 285.9, "epoch": 0.3373737373737374, "grad_norm": 2.4237696716807413, "kl": 0.005771636962890625, "learning_rate": 2e-07, "loss": 0.0007819652557373047, "memory(GiB)": 113.5, "reward": 0.31666666865348814, "reward_std": 0.3596066445112228, "rewards/MultiModalAccuracyORM/mean": 0.31666666865348814, "rewards/MultiModalAccuracyORM/std": 0.3596066445112228, "step": 835, "train_speed(iter/s)": 0.032078 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.1, "completions/mean_length": 283.40834350585936, "completions/min_length": 164.0, "epoch": 0.3393939393939394, "grad_norm": 2.6192745381364615, "kl": 0.00804901123046875, "learning_rate": 2e-07, "loss": 0.04405608177185059, "memory(GiB)": 113.5, "reward": 0.43333334401249884, "reward_std": 0.2840515673160553, "rewards/MultiModalAccuracyORM/mean": 0.43333334401249884, "rewards/MultiModalAccuracyORM/std": 0.2840515673160553, "step": 840, "train_speed(iter/s)": 0.032104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.4, "completions/mean_length": 347.0500091552734, "completions/min_length": 163.6, "epoch": 0.3414141414141414, "grad_norm": 2.7151298756229827, "kl": 0.00639801025390625, "learning_rate": 2e-07, "loss": -0.004790738224983215, "memory(GiB)": 113.5, "reward": 0.4333333484828472, "reward_std": 0.39859413504600527, "rewards/MultiModalAccuracyORM/mean": 0.4333333484828472, "rewards/MultiModalAccuracyORM/std": 0.39859413504600527, "step": 845, "train_speed(iter/s)": 0.032112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.8, "completions/mean_length": 273.2666717529297, "completions/min_length": 157.8, "epoch": 0.3434343434343434, "grad_norm": 1.030614252722568, "kl": 0.0097747802734375, "learning_rate": 2e-07, "loss": 0.0008672773838043213, "memory(GiB)": 113.5, "reward": 0.14166667237877845, "reward_std": 0.28624823689460754, "rewards/MultiModalAccuracyORM/mean": 0.14166667237877845, "rewards/MultiModalAccuracyORM/std": 0.28624823689460754, "step": 850, "train_speed(iter/s)": 0.032141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.9, "completions/mean_length": 196.70833892822264, "completions/min_length": 103.7, "epoch": 0.34545454545454546, "grad_norm": 4.600894892489762, "kl": 0.00904083251953125, "learning_rate": 2e-07, "loss": -0.002990037202835083, "memory(GiB)": 113.5, "reward": 0.35000001043081286, "reward_std": 0.2511145621538162, "rewards/MultiModalAccuracyORM/mean": 0.35000001043081286, "rewards/MultiModalAccuracyORM/std": 0.2511145621538162, "step": 855, "train_speed(iter/s)": 0.0322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.7, "completions/mean_length": 280.9416793823242, "completions/min_length": 175.6, "epoch": 0.3474747474747475, "grad_norm": 2.236927092635949, "kl": 0.0068603515625, "learning_rate": 2e-07, "loss": 0.034914878010749814, "memory(GiB)": 113.5, "reward": 0.27500000670552255, "reward_std": 0.28004167079925535, "rewards/MultiModalAccuracyORM/mean": 0.27500000670552255, "rewards/MultiModalAccuracyORM/std": 0.28004167079925535, "step": 860, "train_speed(iter/s)": 0.032218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.4, "completions/mean_length": 340.1083419799805, "completions/min_length": 183.8, "epoch": 0.34949494949494947, "grad_norm": 3.1857406542737943, "kl": 0.00835723876953125, "learning_rate": 2e-07, "loss": 0.019358628988265993, "memory(GiB)": 113.5, "reward": 0.21666667088866234, "reward_std": 0.25585488975048065, "rewards/MultiModalAccuracyORM/mean": 0.21666667088866234, "rewards/MultiModalAccuracyORM/std": 0.25585488975048065, "step": 865, "train_speed(iter/s)": 0.032254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.8, "completions/mean_length": 342.0833381652832, "completions/min_length": 181.3, "epoch": 0.3515151515151515, "grad_norm": 2.5743781015760714, "kl": 0.00620880126953125, "learning_rate": 2e-07, "loss": -0.019692707061767577, "memory(GiB)": 113.5, "reward": 0.2083333358168602, "reward_std": 0.23004821836948394, "rewards/MultiModalAccuracyORM/mean": 0.2083333358168602, "rewards/MultiModalAccuracyORM/std": 0.23004821836948394, "step": 870, "train_speed(iter/s)": 0.032261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.4, "completions/mean_length": 339.12501220703126, "completions/min_length": 174.0, "epoch": 0.35353535353535354, "grad_norm": 3.184656199614579, "kl": 0.00756072998046875, "learning_rate": 2e-07, "loss": 0.016688653826713563, "memory(GiB)": 113.5, "reward": 0.39166667610406875, "reward_std": 0.37845527231693266, "rewards/MultiModalAccuracyORM/mean": 0.39166667610406875, "rewards/MultiModalAccuracyORM/std": 0.37845527231693266, "step": 875, "train_speed(iter/s)": 0.032289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.3, "completions/mean_length": 336.61668090820314, "completions/min_length": 199.1, "epoch": 0.35555555555555557, "grad_norm": 1.8292091239029376, "kl": 0.006783294677734375, "learning_rate": 2e-07, "loss": -0.0035984992980957033, "memory(GiB)": 113.5, "reward": 0.25000000521540644, "reward_std": 0.353110259771347, "rewards/MultiModalAccuracyORM/mean": 0.25000000521540644, "rewards/MultiModalAccuracyORM/std": 0.353110259771347, "step": 880, "train_speed(iter/s)": 0.032303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.7, "completions/mean_length": 367.0833480834961, "completions/min_length": 201.2, "epoch": 0.3575757575757576, "grad_norm": 2.157154554024042, "kl": 0.0074066162109375, "learning_rate": 2e-07, "loss": -0.012543225288391113, "memory(GiB)": 113.5, "reward": 0.2666666693985462, "reward_std": 0.292328941822052, "rewards/MultiModalAccuracyORM/mean": 0.2666666693985462, "rewards/MultiModalAccuracyORM/std": 0.292328941822052, "step": 885, "train_speed(iter/s)": 0.032311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.1, "completions/mean_length": 368.5666778564453, "completions/min_length": 197.0, "epoch": 0.3595959595959596, "grad_norm": 1.8591339481325562, "kl": 0.01016082763671875, "learning_rate": 2e-07, "loss": -0.015211772918701173, "memory(GiB)": 113.5, "reward": 0.22500000670552253, "reward_std": 0.3802089035511017, "rewards/MultiModalAccuracyORM/mean": 0.22500000670552253, "rewards/MultiModalAccuracyORM/std": 0.3802089035511017, "step": 890, "train_speed(iter/s)": 0.032317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.9, "completions/mean_length": 334.3000091552734, "completions/min_length": 169.7, "epoch": 0.3616161616161616, "grad_norm": 1.891661158050905, "kl": 0.00751495361328125, "learning_rate": 2e-07, "loss": 0.057868242263793945, "memory(GiB)": 113.5, "reward": 0.2833333373069763, "reward_std": 0.36168283224105835, "rewards/MultiModalAccuracyORM/mean": 0.2833333373069763, "rewards/MultiModalAccuracyORM/std": 0.36168283224105835, "step": 895, "train_speed(iter/s)": 0.03232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.6, "completions/mean_length": 367.8083435058594, "completions/min_length": 193.6, "epoch": 0.36363636363636365, "grad_norm": 2.944909454157867, "kl": 0.0079620361328125, "learning_rate": 2e-07, "loss": 0.003379705175757408, "memory(GiB)": 113.5, "reward": 0.17500000670552254, "reward_std": 0.22300148010253906, "rewards/MultiModalAccuracyORM/mean": 0.17500000670552254, "rewards/MultiModalAccuracyORM/std": 0.22300148010253906, "step": 900, "train_speed(iter/s)": 0.032324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.4, "completions/mean_length": 414.4750144958496, "completions/min_length": 241.5, "epoch": 0.3656565656565657, "grad_norm": 1.0572142583091821, "kl": 0.00611724853515625, "learning_rate": 2e-07, "loss": 0.02717306911945343, "memory(GiB)": 113.5, "reward": 0.3083333417773247, "reward_std": 0.27447034418582916, "rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, "rewards/MultiModalAccuracyORM/std": 0.27447034418582916, "step": 905, "train_speed(iter/s)": 0.03233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.5, "completions/mean_length": 329.6416717529297, "completions/min_length": 166.2, "epoch": 0.36767676767676766, "grad_norm": 1.806687314036588, "kl": 0.0089080810546875, "learning_rate": 2e-07, "loss": 0.010141277313232422, "memory(GiB)": 113.5, "reward": 0.391666679084301, "reward_std": 0.40894138514995576, "rewards/MultiModalAccuracyORM/mean": 0.391666679084301, "rewards/MultiModalAccuracyORM/std": 0.40894138514995576, "step": 910, "train_speed(iter/s)": 0.032344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.1, "completions/mean_length": 302.3416748046875, "completions/min_length": 159.3, "epoch": 0.3696969696969697, "grad_norm": 2.825820952489286, "kl": 0.0180572509765625, "learning_rate": 2e-07, "loss": 0.011392435431480408, "memory(GiB)": 113.5, "reward": 0.1916666731238365, "reward_std": 0.33297434747219085, "rewards/MultiModalAccuracyORM/mean": 0.1916666731238365, "rewards/MultiModalAccuracyORM/std": 0.33297434747219085, "step": 915, "train_speed(iter/s)": 0.032356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.4, "completions/mean_length": 327.7416717529297, "completions/min_length": 178.4, "epoch": 0.3717171717171717, "grad_norm": 2.438516683028765, "kl": 0.00719757080078125, "learning_rate": 2e-07, "loss": 0.037606388330459595, "memory(GiB)": 113.5, "reward": 0.23333333879709245, "reward_std": 0.3543280869722366, "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, "rewards/MultiModalAccuracyORM/std": 0.3543280869722366, "step": 920, "train_speed(iter/s)": 0.032355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.9, "completions/mean_length": 351.9500076293945, "completions/min_length": 190.7, "epoch": 0.37373737373737376, "grad_norm": 2.1782598398370943, "kl": 0.006235504150390625, "learning_rate": 2e-07, "loss": -0.007940790057182312, "memory(GiB)": 113.5, "reward": 0.2250000096857548, "reward_std": 0.3659113526344299, "rewards/MultiModalAccuracyORM/mean": 0.2250000096857548, "rewards/MultiModalAccuracyORM/std": 0.3659113526344299, "step": 925, "train_speed(iter/s)": 0.032383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.8, "completions/mean_length": 379.47501373291016, "completions/min_length": 206.3, "epoch": 0.37575757575757573, "grad_norm": 2.304852196233359, "kl": 0.0072235107421875, "learning_rate": 2e-07, "loss": 0.03286640048027038, "memory(GiB)": 113.5, "reward": 0.34166667312383653, "reward_std": 0.44222086369991304, "rewards/MultiModalAccuracyORM/mean": 0.34166667312383653, "rewards/MultiModalAccuracyORM/std": 0.44222086369991304, "step": 930, "train_speed(iter/s)": 0.032396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.6, "completions/mean_length": 323.35000762939455, "completions/min_length": 192.7, "epoch": 0.37777777777777777, "grad_norm": 2.9791049041845494, "kl": 0.01037445068359375, "learning_rate": 2e-07, "loss": -0.007777485251426697, "memory(GiB)": 113.5, "reward": 0.25000000447034837, "reward_std": 0.35737437903881075, "rewards/MultiModalAccuracyORM/mean": 0.25000000447034837, "rewards/MultiModalAccuracyORM/std": 0.35737437903881075, "step": 935, "train_speed(iter/s)": 0.032395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.9, "completions/mean_length": 280.97500762939455, "completions/min_length": 138.2, "epoch": 0.3797979797979798, "grad_norm": 1.881006300919645, "kl": 0.013104248046875, "learning_rate": 2e-07, "loss": 0.02690579891204834, "memory(GiB)": 113.5, "reward": 0.29166667759418485, "reward_std": 0.337774270772934, "rewards/MultiModalAccuracyORM/mean": 0.29166667759418485, "rewards/MultiModalAccuracyORM/std": 0.337774270772934, "step": 940, "train_speed(iter/s)": 0.032434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.1, "completions/mean_length": 295.9500076293945, "completions/min_length": 174.9, "epoch": 0.38181818181818183, "grad_norm": 3.313912407777126, "kl": 0.00870513916015625, "learning_rate": 2e-07, "loss": -0.032750940322875975, "memory(GiB)": 113.5, "reward": 0.40000000819563863, "reward_std": 0.45158345997333527, "rewards/MultiModalAccuracyORM/mean": 0.40000000819563863, "rewards/MultiModalAccuracyORM/std": 0.45158345997333527, "step": 945, "train_speed(iter/s)": 0.032469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.3, "completions/mean_length": 262.3000038146973, "completions/min_length": 133.7, "epoch": 0.3838383838383838, "grad_norm": 2.7566340852478053, "kl": 0.00853729248046875, "learning_rate": 2e-07, "loss": 0.018448495864868165, "memory(GiB)": 113.5, "reward": 0.4083333440124989, "reward_std": 0.2674977511167526, "rewards/MultiModalAccuracyORM/mean": 0.4083333440124989, "rewards/MultiModalAccuracyORM/std": 0.2674977511167526, "step": 950, "train_speed(iter/s)": 0.032489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/mean_length": 363.36668090820314, "completions/min_length": 190.9, "epoch": 0.38585858585858585, "grad_norm": 1.6957648809966595, "kl": 0.0067291259765625, "learning_rate": 2e-07, "loss": -0.02898831069469452, "memory(GiB)": 113.5, "reward": 0.2000000074505806, "reward_std": 0.32902404963970183, "rewards/MultiModalAccuracyORM/mean": 0.2000000074505806, "rewards/MultiModalAccuracyORM/std": 0.32902404963970183, "step": 955, "train_speed(iter/s)": 0.032506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.1, "completions/mean_length": 307.16667022705076, "completions/min_length": 173.2, "epoch": 0.3878787878787879, "grad_norm": 2.6324617057971755, "kl": 0.0082183837890625, "learning_rate": 2e-07, "loss": 0.010876613110303879, "memory(GiB)": 113.5, "reward": 0.2916666708886623, "reward_std": 0.3953502655029297, "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, "rewards/MultiModalAccuracyORM/std": 0.3953502655029297, "step": 960, "train_speed(iter/s)": 0.032526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.3, "completions/mean_length": 310.78333892822263, "completions/min_length": 176.0, "epoch": 0.3898989898989899, "grad_norm": 0.25204548209314886, "kl": 0.01051025390625, "learning_rate": 2e-07, "loss": 0.05701416730880737, "memory(GiB)": 113.5, "reward": 0.2500000029802322, "reward_std": 0.2885732680559158, "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, "rewards/MultiModalAccuracyORM/std": 0.2885732680559158, "step": 965, "train_speed(iter/s)": 0.032526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.1, "completions/mean_length": 354.4916687011719, "completions/min_length": 207.3, "epoch": 0.39191919191919194, "grad_norm": 1.8105174117337208, "kl": 0.00800018310546875, "learning_rate": 2e-07, "loss": 0.008932539820671081, "memory(GiB)": 113.5, "reward": 0.18333333730697632, "reward_std": 0.3538196414709091, "rewards/MultiModalAccuracyORM/mean": 0.18333333730697632, "rewards/MultiModalAccuracyORM/std": 0.3538196414709091, "step": 970, "train_speed(iter/s)": 0.032536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.5, "completions/mean_length": 375.68334503173827, "completions/min_length": 236.0, "epoch": 0.3939393939393939, "grad_norm": 1.4251930411180411, "kl": 0.00720672607421875, "learning_rate": 2e-07, "loss": -0.04558621346950531, "memory(GiB)": 113.5, "reward": 0.10833333507180214, "reward_std": 0.2549058347940445, "rewards/MultiModalAccuracyORM/mean": 0.10833333507180214, "rewards/MultiModalAccuracyORM/std": 0.2549058347940445, "step": 975, "train_speed(iter/s)": 0.032554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.2, "completions/mean_length": 440.5833435058594, "completions/min_length": 218.4, "epoch": 0.39595959595959596, "grad_norm": 1.8329415728640532, "kl": 0.0074310302734375, "learning_rate": 2e-07, "loss": -0.004531031847000122, "memory(GiB)": 113.5, "reward": 0.2666666738688946, "reward_std": 0.351182359457016, "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, "rewards/MultiModalAccuracyORM/std": 0.351182359457016, "step": 980, "train_speed(iter/s)": 0.032558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.7, "completions/mean_length": 267.5916748046875, "completions/min_length": 165.0, "epoch": 0.397979797979798, "grad_norm": 2.742069878873229, "kl": 0.05179443359375, "learning_rate": 2e-07, "loss": 0.019256360828876495, "memory(GiB)": 113.5, "reward": 0.33333334028720857, "reward_std": 0.3274982154369354, "rewards/MultiModalAccuracyORM/mean": 0.33333334028720857, "rewards/MultiModalAccuracyORM/std": 0.3274982154369354, "step": 985, "train_speed(iter/s)": 0.032587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.6, "completions/mean_length": 360.90001068115237, "completions/min_length": 205.4, "epoch": 0.4, "grad_norm": 3.049274715544681, "kl": 0.00958404541015625, "learning_rate": 2e-07, "loss": -0.033705079555511476, "memory(GiB)": 113.5, "reward": 0.31666667610406873, "reward_std": 0.27122942507267, "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, "rewards/MultiModalAccuracyORM/std": 0.27122942507267, "step": 990, "train_speed(iter/s)": 0.032615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.2, "completions/mean_length": 330.02501220703124, "completions/min_length": 198.8, "epoch": 0.402020202020202, "grad_norm": 2.6515591125640574, "kl": 0.0110382080078125, "learning_rate": 2e-07, "loss": 0.008444187045097352, "memory(GiB)": 113.5, "reward": 0.41666667982935907, "reward_std": 0.4297270834445953, "rewards/MultiModalAccuracyORM/mean": 0.41666667982935907, "rewards/MultiModalAccuracyORM/std": 0.4297270834445953, "step": 995, "train_speed(iter/s)": 0.032638 }, { "epoch": 0.40404040404040403, "grad_norm": 1.6423776292289114, "learning_rate": 2e-07, "loss": -0.0013245075941085815, "memory(GiB)": 113.5, "step": 1000, "train_speed(iter/s)": 0.032641 }, { "epoch": 0.40404040404040403, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 565.38, "eval_completions/mean_length": 346.96667633056643, "eval_completions/min_length": 203.6, "eval_kl": 0.00558807373046875, "eval_loss": 0.016358518972992897, "eval_reward": 0.3083333417773247, "eval_reward_std": 0.3403226917982101, "eval_rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, "eval_rewards/MultiModalAccuracyORM/std": 0.3403226917982101, "eval_runtime": 586.662, "eval_samples_per_second": 0.085, "eval_steps_per_second": 0.009, "step": 1000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.004166666666666667, "completions/max_length": 608.15, "completions/mean_length": 370.19167709350586, "completions/min_length": 202.85, "epoch": 0.40606060606060607, "grad_norm": 2.014189773891532, "kl": 0.009693145751953125, "learning_rate": 2e-07, "loss": 0.026693809032440185, "memory(GiB)": 113.5, "reward": 0.22500000484287738, "reward_std": 0.2774069786071777, "rewards/MultiModalAccuracyORM/mean": 0.22500000484287738, "rewards/MultiModalAccuracyORM/std": 0.2774069786071777, "step": 1005, "train_speed(iter/s)": 0.031849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.9, "completions/mean_length": 427.22500762939455, "completions/min_length": 238.0, "epoch": 0.4080808080808081, "grad_norm": 2.2096007474060633, "kl": 0.00854034423828125, "learning_rate": 2e-07, "loss": -0.01839480996131897, "memory(GiB)": 113.5, "reward": 0.19166667014360428, "reward_std": 0.23004821836948394, "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, "rewards/MultiModalAccuracyORM/std": 0.23004821836948394, "step": 1010, "train_speed(iter/s)": 0.031846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.1, "completions/mean_length": 383.8250106811523, "completions/min_length": 206.1, "epoch": 0.4101010101010101, "grad_norm": 2.360953993727072, "kl": 0.00855560302734375, "learning_rate": 2e-07, "loss": -0.03324509263038635, "memory(GiB)": 113.5, "reward": 0.46666667610406876, "reward_std": 0.36664178371429446, "rewards/MultiModalAccuracyORM/mean": 0.46666667610406876, "rewards/MultiModalAccuracyORM/std": 0.36664178371429446, "step": 1015, "train_speed(iter/s)": 0.031859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.4, "completions/mean_length": 396.28334350585936, "completions/min_length": 187.6, "epoch": 0.4121212121212121, "grad_norm": 1.1532109667394932, "kl": 0.00652008056640625, "learning_rate": 2e-07, "loss": 0.012686711549758912, "memory(GiB)": 113.5, "reward": 0.23333333805203438, "reward_std": 0.3129522502422333, "rewards/MultiModalAccuracyORM/mean": 0.23333333805203438, "rewards/MultiModalAccuracyORM/std": 0.3129522502422333, "step": 1020, "train_speed(iter/s)": 0.031867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.3, "completions/mean_length": 314.48334045410155, "completions/min_length": 181.4, "epoch": 0.41414141414141414, "grad_norm": 2.3285330234433017, "kl": 0.01016845703125, "learning_rate": 2e-07, "loss": -0.00456441193819046, "memory(GiB)": 113.5, "reward": 0.35000001043081286, "reward_std": 0.36670138239860534, "rewards/MultiModalAccuracyORM/mean": 0.35000001043081286, "rewards/MultiModalAccuracyORM/std": 0.36670138239860534, "step": 1025, "train_speed(iter/s)": 0.03188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.1, "completions/mean_length": 417.8000122070313, "completions/min_length": 228.2, "epoch": 0.4161616161616162, "grad_norm": 4.123945619995185, "kl": 0.0071319580078125, "learning_rate": 2e-07, "loss": -0.015000586211681367, "memory(GiB)": 113.5, "reward": 0.30833334252238276, "reward_std": 0.4016164273023605, "rewards/MultiModalAccuracyORM/mean": 0.30833334252238276, "rewards/MultiModalAccuracyORM/std": 0.4016164273023605, "step": 1030, "train_speed(iter/s)": 0.031877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.2, "completions/mean_length": 334.6833435058594, "completions/min_length": 201.1, "epoch": 0.41818181818181815, "grad_norm": 1.0210308419459193, "kl": 0.00837860107421875, "learning_rate": 2e-07, "loss": -0.008147723227739333, "memory(GiB)": 113.5, "reward": 0.14166667312383652, "reward_std": 0.14815283417701722, "rewards/MultiModalAccuracyORM/mean": 0.14166667312383652, "rewards/MultiModalAccuracyORM/std": 0.14815283417701722, "step": 1035, "train_speed(iter/s)": 0.03191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.1, "completions/mean_length": 269.1333374023437, "completions/min_length": 140.9, "epoch": 0.4202020202020202, "grad_norm": 2.4151827408725546, "kl": 0.01279144287109375, "learning_rate": 2e-07, "loss": -0.0017376184463500977, "memory(GiB)": 113.5, "reward": 0.5000000074505806, "reward_std": 0.2591939508914948, "rewards/MultiModalAccuracyORM/mean": 0.5000000074505806, "rewards/MultiModalAccuracyORM/std": 0.2591939508914948, "step": 1040, "train_speed(iter/s)": 0.031912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/mean_length": 420.541682434082, "completions/min_length": 252.2, "epoch": 0.4222222222222222, "grad_norm": 1.5651289466382694, "kl": 0.0089202880859375, "learning_rate": 2e-07, "loss": 0.007678426802158356, "memory(GiB)": 113.5, "reward": 0.07500000074505805, "reward_std": 0.17705594301223754, "rewards/MultiModalAccuracyORM/mean": 0.07500000074505805, "rewards/MultiModalAccuracyORM/std": 0.17705594301223754, "step": 1045, "train_speed(iter/s)": 0.031887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.3, "completions/mean_length": 382.8750061035156, "completions/min_length": 219.2, "epoch": 0.42424242424242425, "grad_norm": 1.6669744297788438, "kl": 0.010504150390625, "learning_rate": 2e-07, "loss": 0.04403962194919586, "memory(GiB)": 113.5, "reward": 0.1916666716337204, "reward_std": 0.2908295333385468, "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, "rewards/MultiModalAccuracyORM/std": 0.2908295333385468, "step": 1050, "train_speed(iter/s)": 0.031882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.7, "completions/mean_length": 340.3916778564453, "completions/min_length": 203.1, "epoch": 0.4262626262626263, "grad_norm": 0.08753771583148155, "kl": 0.006915283203125, "learning_rate": 2e-07, "loss": -0.00030135512351989744, "memory(GiB)": 113.5, "reward": 0.3250000074505806, "reward_std": 0.31046818792819975, "rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, "rewards/MultiModalAccuracyORM/std": 0.31046818792819975, "step": 1055, "train_speed(iter/s)": 0.031895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.3, "completions/mean_length": 407.2916793823242, "completions/min_length": 253.6, "epoch": 0.42828282828282827, "grad_norm": 2.1853625197058877, "kl": 0.01122894287109375, "learning_rate": 2e-07, "loss": -0.009478866308927535, "memory(GiB)": 113.5, "reward": 0.2666666679084301, "reward_std": 0.2940108567476273, "rewards/MultiModalAccuracyORM/mean": 0.2666666679084301, "rewards/MultiModalAccuracyORM/std": 0.2940108567476273, "step": 1060, "train_speed(iter/s)": 0.03188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.8, "completions/mean_length": 379.0666793823242, "completions/min_length": 175.8, "epoch": 0.4303030303030303, "grad_norm": 1.8429441156917366, "kl": 0.0084381103515625, "learning_rate": 2e-07, "loss": 0.008666989207267762, "memory(GiB)": 113.5, "reward": 0.3666666731238365, "reward_std": 0.40242100059986113, "rewards/MultiModalAccuracyORM/mean": 0.3666666731238365, "rewards/MultiModalAccuracyORM/std": 0.40242100059986113, "step": 1065, "train_speed(iter/s)": 0.031897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.8, "completions/mean_length": 411.7250183105469, "completions/min_length": 222.1, "epoch": 0.43232323232323233, "grad_norm": 1.8025359450969856, "kl": 0.0067352294921875, "learning_rate": 2e-07, "loss": -0.020195412635803222, "memory(GiB)": 113.5, "reward": 0.1166666716337204, "reward_std": 0.1745694547891617, "rewards/MultiModalAccuracyORM/mean": 0.1166666716337204, "rewards/MultiModalAccuracyORM/std": 0.1745694547891617, "step": 1070, "train_speed(iter/s)": 0.031919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/mean_length": 334.0916748046875, "completions/min_length": 184.4, "epoch": 0.43434343434343436, "grad_norm": 1.6111066415316333, "kl": 0.00709228515625, "learning_rate": 2e-07, "loss": -0.004982185363769531, "memory(GiB)": 113.5, "reward": 0.3583333410322666, "reward_std": 0.27148364782333373, "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, "rewards/MultiModalAccuracyORM/std": 0.27148364782333373, "step": 1075, "train_speed(iter/s)": 0.031909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.8, "completions/mean_length": 305.34167404174804, "completions/min_length": 176.1, "epoch": 0.43636363636363634, "grad_norm": 2.273654506806337, "kl": 0.00942840576171875, "learning_rate": 2e-07, "loss": -0.0076661787927150725, "memory(GiB)": 113.5, "reward": 0.2583333417773247, "reward_std": 0.2122136175632477, "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, "rewards/MultiModalAccuracyORM/std": 0.2122136175632477, "step": 1080, "train_speed(iter/s)": 0.031918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.9, "completions/mean_length": 298.60000839233396, "completions/min_length": 156.5, "epoch": 0.4383838383838384, "grad_norm": 2.1394215246213495, "kl": 0.0081207275390625, "learning_rate": 2e-07, "loss": 0.01651126444339752, "memory(GiB)": 113.5, "reward": 0.3416666768491268, "reward_std": 0.4186849981546402, "rewards/MultiModalAccuracyORM/mean": 0.3416666768491268, "rewards/MultiModalAccuracyORM/std": 0.4186849981546402, "step": 1085, "train_speed(iter/s)": 0.031921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.4, "completions/mean_length": 338.9500137329102, "completions/min_length": 196.0, "epoch": 0.4404040404040404, "grad_norm": 2.545454860927846, "kl": 0.00835113525390625, "learning_rate": 2e-07, "loss": 0.04256980717182159, "memory(GiB)": 113.5, "reward": 0.3250000014901161, "reward_std": 0.2712650209665298, "rewards/MultiModalAccuracyORM/mean": 0.3250000014901161, "rewards/MultiModalAccuracyORM/std": 0.2712650209665298, "step": 1090, "train_speed(iter/s)": 0.031917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 737.9, "completions/mean_length": 350.0750045776367, "completions/min_length": 177.8, "epoch": 0.44242424242424244, "grad_norm": 1.1515652768332443, "kl": 0.0083038330078125, "learning_rate": 2e-07, "loss": 0.05727236866950989, "memory(GiB)": 113.5, "reward": 0.1333333395421505, "reward_std": 0.22625694572925567, "rewards/MultiModalAccuracyORM/mean": 0.1333333395421505, "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, "step": 1095, "train_speed(iter/s)": 0.031885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.9, "completions/mean_length": 404.4166793823242, "completions/min_length": 222.3, "epoch": 0.4444444444444444, "grad_norm": 2.0946897692044906, "kl": 0.0076324462890625, "learning_rate": 2e-07, "loss": 0.03506229817867279, "memory(GiB)": 113.5, "reward": 0.45833334177732465, "reward_std": 0.41185393929481506, "rewards/MultiModalAccuracyORM/mean": 0.45833334177732465, "rewards/MultiModalAccuracyORM/std": 0.41185393929481506, "step": 1100, "train_speed(iter/s)": 0.031869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.2, "completions/mean_length": 237.7166732788086, "completions/min_length": 130.5, "epoch": 0.44646464646464645, "grad_norm": 2.7145244594416837, "kl": 0.0112579345703125, "learning_rate": 2e-07, "loss": -0.004306972026824951, "memory(GiB)": 113.5, "reward": 0.27500000447034834, "reward_std": 0.28853767216205595, "rewards/MultiModalAccuracyORM/mean": 0.27500000447034834, "rewards/MultiModalAccuracyORM/std": 0.28853767216205595, "step": 1105, "train_speed(iter/s)": 0.031861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.2, "completions/mean_length": 339.60834350585935, "completions/min_length": 189.7, "epoch": 0.4484848484848485, "grad_norm": 1.990851354822169, "kl": 0.04109954833984375, "learning_rate": 2e-07, "loss": 0.01136043295264244, "memory(GiB)": 113.5, "reward": 0.4666666768491268, "reward_std": 0.29859510362148284, "rewards/MultiModalAccuracyORM/mean": 0.4666666768491268, "rewards/MultiModalAccuracyORM/std": 0.29859510362148284, "step": 1110, "train_speed(iter/s)": 0.031854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.4, "completions/mean_length": 347.5666748046875, "completions/min_length": 218.3, "epoch": 0.4505050505050505, "grad_norm": 2.9268025842966563, "kl": 0.0082672119140625, "learning_rate": 2e-07, "loss": -0.0031023643910884856, "memory(GiB)": 113.5, "reward": 0.35000001192092894, "reward_std": 0.3800142765045166, "rewards/MultiModalAccuracyORM/mean": 0.35000001192092894, "rewards/MultiModalAccuracyORM/std": 0.3800142765045166, "step": 1115, "train_speed(iter/s)": 0.031867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.2, "completions/mean_length": 336.5333419799805, "completions/min_length": 190.3, "epoch": 0.45252525252525255, "grad_norm": 2.361080053690534, "kl": 0.009368896484375, "learning_rate": 2e-07, "loss": -0.007122965157032013, "memory(GiB)": 113.5, "reward": 0.4000000074505806, "reward_std": 0.25241934359073637, "rewards/MultiModalAccuracyORM/mean": 0.4000000074505806, "rewards/MultiModalAccuracyORM/std": 0.25241934359073637, "step": 1120, "train_speed(iter/s)": 0.031889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.4, "completions/mean_length": 387.1333465576172, "completions/min_length": 233.6, "epoch": 0.45454545454545453, "grad_norm": 1.323972233543725, "kl": 0.008551025390625, "learning_rate": 2e-07, "loss": 0.03706555962562561, "memory(GiB)": 113.5, "reward": 0.19166667014360428, "reward_std": 0.3109443962574005, "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, "rewards/MultiModalAccuracyORM/std": 0.3109443962574005, "step": 1125, "train_speed(iter/s)": 0.031889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.3, "completions/mean_length": 389.6666778564453, "completions/min_length": 238.9, "epoch": 0.45656565656565656, "grad_norm": 0.7314973239006443, "kl": 0.00838775634765625, "learning_rate": 2e-07, "loss": -0.0037152446806430818, "memory(GiB)": 113.5, "reward": 0.32500000223517417, "reward_std": 0.2556006729602814, "rewards/MultiModalAccuracyORM/mean": 0.32500000223517417, "rewards/MultiModalAccuracyORM/std": 0.2556006729602814, "step": 1130, "train_speed(iter/s)": 0.031898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.1, "completions/mean_length": 376.69167175292966, "completions/min_length": 193.9, "epoch": 0.4585858585858586, "grad_norm": 2.6192210055071947, "kl": 0.008709716796875, "learning_rate": 2e-07, "loss": -0.010700675845146179, "memory(GiB)": 113.5, "reward": 0.2750000089406967, "reward_std": 0.3663875609636307, "rewards/MultiModalAccuracyORM/mean": 0.2750000089406967, "rewards/MultiModalAccuracyORM/std": 0.3663875609636307, "step": 1135, "train_speed(iter/s)": 0.031914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/mean_length": 373.23334350585935, "completions/min_length": 224.3, "epoch": 0.46060606060606063, "grad_norm": 7.217746674191174, "kl": 0.05963134765625, "learning_rate": 2e-07, "loss": 0.005391424894332886, "memory(GiB)": 113.5, "reward": 0.11666666865348815, "reward_std": 0.255160054564476, "rewards/MultiModalAccuracyORM/mean": 0.11666666865348815, "rewards/MultiModalAccuracyORM/std": 0.255160054564476, "step": 1140, "train_speed(iter/s)": 0.031929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.7, "completions/mean_length": 264.12500686645507, "completions/min_length": 149.1, "epoch": 0.4626262626262626, "grad_norm": 2.7608756487475525, "kl": 0.014227294921875, "learning_rate": 2e-07, "loss": 0.01821192502975464, "memory(GiB)": 113.5, "reward": 0.3916666731238365, "reward_std": 0.29640085995197296, "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, "rewards/MultiModalAccuracyORM/std": 0.29640085995197296, "step": 1145, "train_speed(iter/s)": 0.031947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.1, "completions/mean_length": 274.7416763305664, "completions/min_length": 172.5, "epoch": 0.46464646464646464, "grad_norm": 2.4711978185790886, "kl": 0.1343414306640625, "learning_rate": 2e-07, "loss": 0.017589953541755677, "memory(GiB)": 113.5, "reward": 0.24166667237877845, "reward_std": 0.28959646821022034, "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, "rewards/MultiModalAccuracyORM/std": 0.28959646821022034, "step": 1150, "train_speed(iter/s)": 0.031968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.7, "completions/mean_length": 316.32501220703125, "completions/min_length": 170.1, "epoch": 0.4666666666666667, "grad_norm": 2.193137307137183, "kl": 0.00921173095703125, "learning_rate": 2e-07, "loss": 0.03984123468399048, "memory(GiB)": 113.5, "reward": 0.25833334103226663, "reward_std": 0.3578915596008301, "rewards/MultiModalAccuracyORM/mean": 0.25833334103226663, "rewards/MultiModalAccuracyORM/std": 0.3578915596008301, "step": 1155, "train_speed(iter/s)": 0.031978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.9, "completions/mean_length": 312.7916748046875, "completions/min_length": 192.9, "epoch": 0.4686868686868687, "grad_norm": 2.4370225749301886, "kl": 0.0099822998046875, "learning_rate": 2e-07, "loss": 0.04419963359832764, "memory(GiB)": 113.5, "reward": 0.36666667759418486, "reward_std": 0.34560186266899107, "rewards/MultiModalAccuracyORM/mean": 0.36666667759418486, "rewards/MultiModalAccuracyORM/std": 0.34560186266899107, "step": 1160, "train_speed(iter/s)": 0.031974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/mean_length": 355.541674041748, "completions/min_length": 156.5, "epoch": 0.4707070707070707, "grad_norm": 3.306493843440885, "kl": 0.01005859375, "learning_rate": 2e-07, "loss": 0.021104392409324647, "memory(GiB)": 113.5, "reward": 0.31666667461395265, "reward_std": 0.37450254559516905, "rewards/MultiModalAccuracyORM/mean": 0.31666667461395265, "rewards/MultiModalAccuracyORM/std": 0.37450254559516905, "step": 1165, "train_speed(iter/s)": 0.031991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.5, "completions/mean_length": 266.8083435058594, "completions/min_length": 159.0, "epoch": 0.4727272727272727, "grad_norm": 3.2381508792172107, "kl": 0.0092529296875, "learning_rate": 2e-07, "loss": 0.0005752682685852051, "memory(GiB)": 113.5, "reward": 0.19166667237877846, "reward_std": 0.29159851372241974, "rewards/MultiModalAccuracyORM/mean": 0.19166667237877846, "rewards/MultiModalAccuracyORM/std": 0.29159851372241974, "step": 1170, "train_speed(iter/s)": 0.032017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.1, "completions/mean_length": 376.39167938232424, "completions/min_length": 220.4, "epoch": 0.47474747474747475, "grad_norm": 1.6637816276606223, "kl": 0.00755615234375, "learning_rate": 2e-07, "loss": 0.04589937329292297, "memory(GiB)": 113.5, "reward": 0.3083333417773247, "reward_std": 0.37851486802101136, "rewards/MultiModalAccuracyORM/mean": 0.3083333417773247, "rewards/MultiModalAccuracyORM/std": 0.37851486802101136, "step": 1175, "train_speed(iter/s)": 0.032023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.9, "completions/mean_length": 448.13334197998046, "completions/min_length": 217.0, "epoch": 0.4767676767676768, "grad_norm": 2.1978602872808075, "kl": 0.0097900390625, "learning_rate": 2e-07, "loss": -0.009159280359745026, "memory(GiB)": 113.5, "reward": 0.3166666738688946, "reward_std": 0.28452777564525605, "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, "rewards/MultiModalAccuracyORM/std": 0.28452777564525605, "step": 1180, "train_speed(iter/s)": 0.032033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.5, "completions/mean_length": 289.1250053405762, "completions/min_length": 172.7, "epoch": 0.47878787878787876, "grad_norm": 1.3926480253734976, "kl": 0.0114959716796875, "learning_rate": 2e-07, "loss": 0.009945812821388244, "memory(GiB)": 113.5, "reward": 0.5000000149011612, "reward_std": 0.29630566835403443, "rewards/MultiModalAccuracyORM/mean": 0.5000000149011612, "rewards/MultiModalAccuracyORM/std": 0.29630566835403443, "step": 1185, "train_speed(iter/s)": 0.032058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.5, "completions/mean_length": 330.6666732788086, "completions/min_length": 149.8, "epoch": 0.4808080808080808, "grad_norm": 1.9719497919311402, "kl": 0.0093994140625, "learning_rate": 2e-07, "loss": 0.0009687811136245728, "memory(GiB)": 113.5, "reward": 0.3083333410322666, "reward_std": 0.3478672981262207, "rewards/MultiModalAccuracyORM/mean": 0.3083333410322666, "rewards/MultiModalAccuracyORM/std": 0.3478672981262207, "step": 1190, "train_speed(iter/s)": 0.032072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.4, "completions/mean_length": 421.73334350585935, "completions/min_length": 236.0, "epoch": 0.48282828282828283, "grad_norm": 1.243534573096356, "kl": 0.0077484130859375, "learning_rate": 2e-07, "loss": -0.003622010350227356, "memory(GiB)": 113.5, "reward": 0.20833333879709243, "reward_std": 0.29815449118614196, "rewards/MultiModalAccuracyORM/mean": 0.20833333879709243, "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, "step": 1195, "train_speed(iter/s)": 0.03208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.5, "completions/mean_length": 360.5666748046875, "completions/min_length": 198.9, "epoch": 0.48484848484848486, "grad_norm": 2.916364282012391, "kl": 0.0125274658203125, "learning_rate": 2e-07, "loss": -0.021604710817337038, "memory(GiB)": 113.5, "reward": 0.22500000447034835, "reward_std": 0.35037778615951537, "rewards/MultiModalAccuracyORM/mean": 0.22500000447034835, "rewards/MultiModalAccuracyORM/std": 0.35037778615951537, "step": 1200, "train_speed(iter/s)": 0.032073 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/mean_length": 282.54167556762695, "completions/min_length": 161.2, "epoch": 0.4868686868686869, "grad_norm": 1.8011146439004695, "kl": 0.0113494873046875, "learning_rate": 2e-07, "loss": 0.006717947870492935, "memory(GiB)": 113.5, "reward": 0.3333333387970924, "reward_std": 0.19114727079868316, "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, "rewards/MultiModalAccuracyORM/std": 0.19114727079868316, "step": 1205, "train_speed(iter/s)": 0.032089 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.7, "completions/mean_length": 442.2083435058594, "completions/min_length": 237.3, "epoch": 0.4888888888888889, "grad_norm": 1.4349681794675622, "kl": 0.0094146728515625, "learning_rate": 2e-07, "loss": 0.0006179869174957276, "memory(GiB)": 113.5, "reward": 0.2333333410322666, "reward_std": 0.3762586027383804, "rewards/MultiModalAccuracyORM/mean": 0.2333333410322666, "rewards/MultiModalAccuracyORM/std": 0.3762586027383804, "step": 1210, "train_speed(iter/s)": 0.032088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 738.7, "completions/mean_length": 433.8916748046875, "completions/min_length": 249.1, "epoch": 0.4909090909090909, "grad_norm": 2.547575664275865, "kl": 0.00975189208984375, "learning_rate": 2e-07, "loss": -0.026794981956481934, "memory(GiB)": 113.5, "reward": 0.18333333656191825, "reward_std": 0.20118070244789124, "rewards/MultiModalAccuracyORM/mean": 0.18333333656191825, "rewards/MultiModalAccuracyORM/std": 0.20118070244789124, "step": 1215, "train_speed(iter/s)": 0.032076 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 627.1, "completions/mean_length": 359.12500915527346, "completions/min_length": 197.8, "epoch": 0.49292929292929294, "grad_norm": 1.645283475386655, "kl": 0.0115875244140625, "learning_rate": 2e-07, "loss": 0.024244531989097595, "memory(GiB)": 113.5, "reward": 0.3333333387970924, "reward_std": 0.2511145621538162, "rewards/MultiModalAccuracyORM/mean": 0.3333333387970924, "rewards/MultiModalAccuracyORM/std": 0.2511145621538162, "step": 1220, "train_speed(iter/s)": 0.032088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.3, "completions/mean_length": 267.17501068115234, "completions/min_length": 152.0, "epoch": 0.494949494949495, "grad_norm": 1.2460930349970594, "kl": 0.010992431640625, "learning_rate": 2e-07, "loss": 0.007182718813419342, "memory(GiB)": 113.5, "reward": 0.2583333358168602, "reward_std": 0.2536582201719284, "rewards/MultiModalAccuracyORM/mean": 0.2583333358168602, "rewards/MultiModalAccuracyORM/std": 0.2536582201719284, "step": 1225, "train_speed(iter/s)": 0.032122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.8, "completions/mean_length": 405.7416702270508, "completions/min_length": 233.4, "epoch": 0.49696969696969695, "grad_norm": 3.259598192610936, "kl": 0.0076690673828125, "learning_rate": 2e-07, "loss": 0.0032115459442138674, "memory(GiB)": 113.5, "reward": 0.33333334177732465, "reward_std": 0.3470627248287201, "rewards/MultiModalAccuracyORM/mean": 0.33333334177732465, "rewards/MultiModalAccuracyORM/std": 0.3470627248287201, "step": 1230, "train_speed(iter/s)": 0.032125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.1, "completions/mean_length": 306.12501068115233, "completions/min_length": 188.9, "epoch": 0.498989898989899, "grad_norm": 0.7719456506471253, "kl": 0.01029205322265625, "learning_rate": 2e-07, "loss": -0.016546979546546936, "memory(GiB)": 113.5, "reward": 0.1916666693985462, "reward_std": 0.2895223259925842, "rewards/MultiModalAccuracyORM/mean": 0.1916666693985462, "rewards/MultiModalAccuracyORM/std": 0.2895223259925842, "step": 1235, "train_speed(iter/s)": 0.032159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.6, "completions/mean_length": 343.00834503173826, "completions/min_length": 207.8, "epoch": 0.501010101010101, "grad_norm": 1.6531811897726711, "kl": 0.0093170166015625, "learning_rate": 2e-07, "loss": 0.02186596691608429, "memory(GiB)": 113.5, "reward": 0.19166667610406876, "reward_std": 0.2448128044605255, "rewards/MultiModalAccuracyORM/mean": 0.19166667610406876, "rewards/MultiModalAccuracyORM/std": 0.2448128044605255, "step": 1240, "train_speed(iter/s)": 0.032173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.6, "completions/mean_length": 299.2916717529297, "completions/min_length": 172.4, "epoch": 0.503030303030303, "grad_norm": 3.19592384950856, "kl": 0.0099273681640625, "learning_rate": 2e-07, "loss": -0.006601364910602569, "memory(GiB)": 113.5, "reward": 0.2083333410322666, "reward_std": 0.3292782694101334, "rewards/MultiModalAccuracyORM/mean": 0.2083333410322666, "rewards/MultiModalAccuracyORM/std": 0.3292782694101334, "step": 1245, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.5050505050505051, "grad_norm": 0.912303521551538, "learning_rate": 2e-07, "loss": -0.0002701073884963989, "memory(GiB)": 113.5, "step": 1250, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.5050505050505051, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 570.74, "eval_completions/mean_length": 352.94834228515623, "eval_completions/min_length": 210.42, "eval_kl": 0.00790496826171875, "eval_loss": 0.01708856225013733, "eval_reward": 0.2983333393931389, "eval_reward_std": 0.3327623122930527, "eval_rewards/MultiModalAccuracyORM/mean": 0.2983333393931389, "eval_rewards/MultiModalAccuracyORM/std": 0.3327623122930527, "eval_runtime": 568.068, "eval_samples_per_second": 0.088, "eval_steps_per_second": 0.009, "step": 1250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.8, "completions/mean_length": 358.1875087738037, "completions/min_length": 223.5, "epoch": 0.5070707070707071, "grad_norm": 1.8888348326711508, "kl": 0.01190643310546875, "learning_rate": 2e-07, "loss": 0.019428746402263643, "memory(GiB)": 113.5, "reward": 0.27916667275130747, "reward_std": 0.38802969008684157, "rewards/MultiModalAccuracyORM/mean": 0.27916667275130747, "rewards/MultiModalAccuracyORM/std": 0.38802969008684157, "step": 1255, "train_speed(iter/s)": 0.031527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.7, "completions/mean_length": 373.8333465576172, "completions/min_length": 225.6, "epoch": 0.509090909090909, "grad_norm": 1.8028043067539863, "kl": 0.0100677490234375, "learning_rate": 2e-07, "loss": 0.027076438069343567, "memory(GiB)": 113.5, "reward": 0.19166667386889458, "reward_std": 0.3207202464342117, "rewards/MultiModalAccuracyORM/mean": 0.19166667386889458, "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, "step": 1260, "train_speed(iter/s)": 0.031526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.3, "completions/mean_length": 397.9250152587891, "completions/min_length": 204.6, "epoch": 0.5111111111111111, "grad_norm": 2.2225728768142723, "kl": 0.011029052734375, "learning_rate": 2e-07, "loss": -0.04860515892505646, "memory(GiB)": 113.5, "reward": 0.29166667684912684, "reward_std": 0.33303394317626955, "rewards/MultiModalAccuracyORM/mean": 0.29166667684912684, "rewards/MultiModalAccuracyORM/std": 0.33303394317626955, "step": 1265, "train_speed(iter/s)": 0.031521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.6, "completions/mean_length": 342.73334350585935, "completions/min_length": 191.1, "epoch": 0.5131313131313131, "grad_norm": 1.467354462173463, "kl": 0.0107818603515625, "learning_rate": 2e-07, "loss": 0.03341347873210907, "memory(GiB)": 113.5, "reward": 0.34166667982935905, "reward_std": 0.2812868595123291, "rewards/MultiModalAccuracyORM/mean": 0.34166667982935905, "rewards/MultiModalAccuracyORM/std": 0.2812868595123291, "step": 1270, "train_speed(iter/s)": 0.031536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.8, "completions/mean_length": 283.72500762939455, "completions/min_length": 163.8, "epoch": 0.5151515151515151, "grad_norm": 3.716342095943599, "kl": 0.014361572265625, "learning_rate": 2e-07, "loss": 0.02838865518569946, "memory(GiB)": 113.5, "reward": 0.433333345502615, "reward_std": 0.3993005663156509, "rewards/MultiModalAccuracyORM/mean": 0.433333345502615, "rewards/MultiModalAccuracyORM/std": 0.3993005663156509, "step": 1275, "train_speed(iter/s)": 0.031554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.3, "completions/mean_length": 302.8583389282227, "completions/min_length": 167.6, "epoch": 0.5171717171717172, "grad_norm": 2.503627797323309, "kl": 0.0103668212890625, "learning_rate": 2e-07, "loss": 0.00705558955669403, "memory(GiB)": 113.5, "reward": 0.6333333551883698, "reward_std": 0.43680969774723055, "rewards/MultiModalAccuracyORM/mean": 0.6333333551883698, "rewards/MultiModalAccuracyORM/std": 0.43680969774723055, "step": 1280, "train_speed(iter/s)": 0.031583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.5, "completions/mean_length": 403.7000076293945, "completions/min_length": 185.7, "epoch": 0.5191919191919192, "grad_norm": 3.2251809838929315, "kl": 0.0101654052734375, "learning_rate": 2e-07, "loss": -0.037446904182434085, "memory(GiB)": 113.5, "reward": 0.3083333395421505, "reward_std": 0.3978011578321457, "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, "rewards/MultiModalAccuracyORM/std": 0.3978011578321457, "step": 1285, "train_speed(iter/s)": 0.031592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.7, "completions/mean_length": 300.60834045410155, "completions/min_length": 158.4, "epoch": 0.5212121212121212, "grad_norm": 1.7359935662818697, "kl": 0.0238189697265625, "learning_rate": 2e-07, "loss": 0.005645626783370971, "memory(GiB)": 113.5, "reward": 0.4916666761040688, "reward_std": 0.37272491455078127, "rewards/MultiModalAccuracyORM/mean": 0.4916666761040688, "rewards/MultiModalAccuracyORM/std": 0.37272491455078127, "step": 1290, "train_speed(iter/s)": 0.031618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 658.6, "completions/mean_length": 381.3166809082031, "completions/min_length": 189.5, "epoch": 0.5232323232323233, "grad_norm": 2.291063786312688, "kl": 0.0126251220703125, "learning_rate": 2e-07, "loss": 0.03457438945770264, "memory(GiB)": 113.5, "reward": 0.29166667312383654, "reward_std": 0.3760043799877167, "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, "rewards/MultiModalAccuracyORM/std": 0.3760043799877167, "step": 1295, "train_speed(iter/s)": 0.031626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.2, "completions/mean_length": 334.5833480834961, "completions/min_length": 191.9, "epoch": 0.5252525252525253, "grad_norm": 2.290129258389379, "kl": 0.0095916748046875, "learning_rate": 2e-07, "loss": -0.024787557125091553, "memory(GiB)": 113.5, "reward": 0.37500000894069674, "reward_std": 0.29634126722812654, "rewards/MultiModalAccuracyORM/mean": 0.37500000894069674, "rewards/MultiModalAccuracyORM/std": 0.29634126722812654, "step": 1300, "train_speed(iter/s)": 0.031656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.6, "completions/mean_length": 374.36668395996094, "completions/min_length": 192.7, "epoch": 0.5272727272727272, "grad_norm": 2.0627127342369556, "kl": 0.0099151611328125, "learning_rate": 2e-07, "loss": 0.004585762321949005, "memory(GiB)": 113.5, "reward": 0.2250000037252903, "reward_std": 0.40560232698917387, "rewards/MultiModalAccuracyORM/mean": 0.2250000037252903, "rewards/MultiModalAccuracyORM/std": 0.40560232698917387, "step": 1305, "train_speed(iter/s)": 0.031669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 569.9, "completions/mean_length": 281.84167556762696, "completions/min_length": 151.0, "epoch": 0.5292929292929293, "grad_norm": 1.2890377388651022, "kl": 0.012725830078125, "learning_rate": 2e-07, "loss": -0.00015339255332946777, "memory(GiB)": 113.5, "reward": 0.341666679084301, "reward_std": 0.31068681478500365, "rewards/MultiModalAccuracyORM/mean": 0.341666679084301, "rewards/MultiModalAccuracyORM/std": 0.31068681478500365, "step": 1310, "train_speed(iter/s)": 0.031675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.4, "completions/mean_length": 307.5916717529297, "completions/min_length": 186.8, "epoch": 0.5313131313131313, "grad_norm": 2.0391373518251648, "kl": 0.0098358154296875, "learning_rate": 2e-07, "loss": 0.06573413610458374, "memory(GiB)": 113.5, "reward": 0.28333333805203437, "reward_std": 0.351182359457016, "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, "rewards/MultiModalAccuracyORM/std": 0.351182359457016, "step": 1315, "train_speed(iter/s)": 0.031691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/mean_length": 273.75001068115233, "completions/min_length": 144.0, "epoch": 0.5333333333333333, "grad_norm": 2.059578451448539, "kl": 0.01175537109375, "learning_rate": 2e-07, "loss": 0.04888114631175995, "memory(GiB)": 113.5, "reward": 0.3500000096857548, "reward_std": 0.4166352391242981, "rewards/MultiModalAccuracyORM/mean": 0.3500000096857548, "rewards/MultiModalAccuracyORM/std": 0.4166352391242981, "step": 1320, "train_speed(iter/s)": 0.031715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.8, "completions/mean_length": 376.00000915527346, "completions/min_length": 193.4, "epoch": 0.5353535353535354, "grad_norm": 2.604163597368088, "kl": 0.014556884765625, "learning_rate": 2e-07, "loss": 0.025493156909942628, "memory(GiB)": 113.5, "reward": 0.2833333432674408, "reward_std": 0.33376437425613403, "rewards/MultiModalAccuracyORM/mean": 0.2833333432674408, "rewards/MultiModalAccuracyORM/std": 0.33376437425613403, "step": 1325, "train_speed(iter/s)": 0.031732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.3, "completions/mean_length": 319.8833465576172, "completions/min_length": 185.2, "epoch": 0.5373737373737374, "grad_norm": 2.939920293327845, "kl": 0.0071502685546875, "learning_rate": 2e-07, "loss": -0.0020159482955932617, "memory(GiB)": 113.5, "reward": 0.2500000037252903, "reward_std": 0.33000870048999786, "rewards/MultiModalAccuracyORM/mean": 0.2500000037252903, "rewards/MultiModalAccuracyORM/std": 0.33000870048999786, "step": 1330, "train_speed(iter/s)": 0.031756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.3, "completions/mean_length": 409.9833419799805, "completions/min_length": 273.6, "epoch": 0.5393939393939394, "grad_norm": 1.8600557381971845, "kl": 0.011651611328125, "learning_rate": 2e-07, "loss": 0.014566189050674439, "memory(GiB)": 113.5, "reward": 0.2500000029802322, "reward_std": 0.36642315685749055, "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, "rewards/MultiModalAccuracyORM/std": 0.36642315685749055, "step": 1335, "train_speed(iter/s)": 0.031764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/mean_length": 304.6666717529297, "completions/min_length": 186.0, "epoch": 0.5414141414141415, "grad_norm": 2.250726659882682, "kl": 0.0128570556640625, "learning_rate": 2e-07, "loss": 0.0025543123483657837, "memory(GiB)": 113.5, "reward": 0.1416666679084301, "reward_std": 0.24939410090446473, "rewards/MultiModalAccuracyORM/mean": 0.1416666679084301, "rewards/MultiModalAccuracyORM/std": 0.24939410090446473, "step": 1340, "train_speed(iter/s)": 0.031764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.8, "completions/mean_length": 320.21667633056643, "completions/min_length": 156.5, "epoch": 0.5434343434343434, "grad_norm": 3.49354443152123, "kl": 0.01195068359375, "learning_rate": 2e-07, "loss": -0.026122617721557616, "memory(GiB)": 113.5, "reward": 0.10833333656191826, "reward_std": 0.2714240521192551, "rewards/MultiModalAccuracyORM/mean": 0.10833333656191826, "rewards/MultiModalAccuracyORM/std": 0.2714240521192551, "step": 1345, "train_speed(iter/s)": 0.031794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/mean_length": 359.27501220703124, "completions/min_length": 227.5, "epoch": 0.5454545454545454, "grad_norm": 1.3573423115932872, "kl": 0.008941650390625, "learning_rate": 2e-07, "loss": 0.02098418176174164, "memory(GiB)": 113.5, "reward": 0.30833334401249884, "reward_std": 0.3207202464342117, "rewards/MultiModalAccuracyORM/mean": 0.30833334401249884, "rewards/MultiModalAccuracyORM/std": 0.3207202464342117, "step": 1350, "train_speed(iter/s)": 0.031817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.3, "completions/mean_length": 368.5750106811523, "completions/min_length": 194.4, "epoch": 0.5474747474747474, "grad_norm": 1.268246321814541, "kl": 0.011553955078125, "learning_rate": 2e-07, "loss": -0.037621939182281496, "memory(GiB)": 113.5, "reward": 0.3083333447575569, "reward_std": 0.3823301374912262, "rewards/MultiModalAccuracyORM/mean": 0.3083333447575569, "rewards/MultiModalAccuracyORM/std": 0.3823301374912262, "step": 1355, "train_speed(iter/s)": 0.031821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.5, "completions/mean_length": 337.325008392334, "completions/min_length": 166.7, "epoch": 0.5494949494949495, "grad_norm": 1.1405744205796668, "kl": 0.0093414306640625, "learning_rate": 2e-07, "loss": 0.05270506143569946, "memory(GiB)": 113.5, "reward": 0.5333333417773247, "reward_std": 0.30996555387973784, "rewards/MultiModalAccuracyORM/mean": 0.5333333417773247, "rewards/MultiModalAccuracyORM/std": 0.30996555387973784, "step": 1360, "train_speed(iter/s)": 0.031838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.6, "completions/mean_length": 287.0166732788086, "completions/min_length": 144.9, "epoch": 0.5515151515151515, "grad_norm": 2.505246091989759, "kl": 0.0113037109375, "learning_rate": 2e-07, "loss": -0.027878284454345703, "memory(GiB)": 113.5, "reward": 0.24166667833924294, "reward_std": 0.34710127115249634, "rewards/MultiModalAccuracyORM/mean": 0.24166667833924294, "rewards/MultiModalAccuracyORM/std": 0.34710127115249634, "step": 1365, "train_speed(iter/s)": 0.031858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.9, "completions/mean_length": 330.1916793823242, "completions/min_length": 153.1, "epoch": 0.5535353535353535, "grad_norm": 3.131582003300663, "kl": 0.0138336181640625, "learning_rate": 2e-07, "loss": -0.0038233429193496706, "memory(GiB)": 113.5, "reward": 0.31666667982935903, "reward_std": 0.37345829904079436, "rewards/MultiModalAccuracyORM/mean": 0.31666667982935903, "rewards/MultiModalAccuracyORM/std": 0.37345829904079436, "step": 1370, "train_speed(iter/s)": 0.031869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.4, "completions/mean_length": 286.3666702270508, "completions/min_length": 149.1, "epoch": 0.5555555555555556, "grad_norm": 1.9956984289591713, "kl": 0.0112335205078125, "learning_rate": 2e-07, "loss": -0.012190797924995422, "memory(GiB)": 113.5, "reward": 0.40833333879709244, "reward_std": 0.3855446308851242, "rewards/MultiModalAccuracyORM/mean": 0.40833333879709244, "rewards/MultiModalAccuracyORM/std": 0.3855446308851242, "step": 1375, "train_speed(iter/s)": 0.031897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.7, "completions/mean_length": 324.9166763305664, "completions/min_length": 202.3, "epoch": 0.5575757575757576, "grad_norm": 2.1303502921633335, "kl": 0.00983428955078125, "learning_rate": 2e-07, "loss": 0.02664785385131836, "memory(GiB)": 113.5, "reward": 0.40833333879709244, "reward_std": 0.30971133410930635, "rewards/MultiModalAccuracyORM/mean": 0.40833333879709244, "rewards/MultiModalAccuracyORM/std": 0.30971133410930635, "step": 1380, "train_speed(iter/s)": 0.031915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.6, "completions/mean_length": 356.1666793823242, "completions/min_length": 201.3, "epoch": 0.5595959595959596, "grad_norm": 2.9931509831712524, "kl": 0.012689208984375, "learning_rate": 2e-07, "loss": -0.039350539445877075, "memory(GiB)": 113.5, "reward": 0.1666666716337204, "reward_std": 0.2917931377887726, "rewards/MultiModalAccuracyORM/mean": 0.1666666716337204, "rewards/MultiModalAccuracyORM/std": 0.2917931377887726, "step": 1385, "train_speed(iter/s)": 0.031924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/mean_length": 356.6083358764648, "completions/min_length": 193.0, "epoch": 0.5616161616161616, "grad_norm": 2.198573582527943, "kl": 0.008941650390625, "learning_rate": 2e-07, "loss": -0.022810643911361693, "memory(GiB)": 113.5, "reward": 0.35000001415610316, "reward_std": 0.32673218548297883, "rewards/MultiModalAccuracyORM/mean": 0.35000001415610316, "rewards/MultiModalAccuracyORM/std": 0.32673218548297883, "step": 1390, "train_speed(iter/s)": 0.031929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/mean_length": 314.82500686645506, "completions/min_length": 182.1, "epoch": 0.5636363636363636, "grad_norm": 2.150533068523157, "kl": 0.0106353759765625, "learning_rate": 2e-07, "loss": -0.013728708028793335, "memory(GiB)": 113.5, "reward": 0.46666667312383653, "reward_std": 0.25897532403469087, "rewards/MultiModalAccuracyORM/mean": 0.46666667312383653, "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, "step": 1395, "train_speed(iter/s)": 0.031954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.3, "completions/mean_length": 353.3833465576172, "completions/min_length": 204.6, "epoch": 0.5656565656565656, "grad_norm": 1.940941493471918, "kl": 0.0095428466796875, "learning_rate": 2e-07, "loss": -0.006394821405410767, "memory(GiB)": 113.5, "reward": 0.3916666753590107, "reward_std": 0.34550372064113616, "rewards/MultiModalAccuracyORM/mean": 0.3916666753590107, "rewards/MultiModalAccuracyORM/std": 0.34550372064113616, "step": 1400, "train_speed(iter/s)": 0.031962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.4, "completions/mean_length": 432.6000045776367, "completions/min_length": 228.0, "epoch": 0.5676767676767677, "grad_norm": 2.4378327864764286, "kl": 0.011553955078125, "learning_rate": 2e-07, "loss": 0.04005226194858551, "memory(GiB)": 113.5, "reward": 0.2916666746139526, "reward_std": 0.3370794355869293, "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, "step": 1405, "train_speed(iter/s)": 0.031967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.2, "completions/mean_length": 384.7333465576172, "completions/min_length": 220.2, "epoch": 0.5696969696969697, "grad_norm": 0.780805540568698, "kl": 0.01131591796875, "learning_rate": 2e-07, "loss": 0.03709500730037689, "memory(GiB)": 113.5, "reward": 0.33333333730697634, "reward_std": 0.3572298943996429, "rewards/MultiModalAccuracyORM/mean": 0.33333333730697634, "rewards/MultiModalAccuracyORM/std": 0.3572298943996429, "step": 1410, "train_speed(iter/s)": 0.031973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/mean_length": 321.6333465576172, "completions/min_length": 164.9, "epoch": 0.5717171717171717, "grad_norm": 1.430362343806847, "kl": 0.0101104736328125, "learning_rate": 2e-07, "loss": 0.013754424452781678, "memory(GiB)": 113.5, "reward": 0.2583333387970924, "reward_std": 0.28555097579956057, "rewards/MultiModalAccuracyORM/mean": 0.2583333387970924, "rewards/MultiModalAccuracyORM/std": 0.28555097579956057, "step": 1415, "train_speed(iter/s)": 0.031992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 816.5, "completions/mean_length": 438.3833435058594, "completions/min_length": 264.8, "epoch": 0.5737373737373738, "grad_norm": 1.6263448971015675, "kl": 0.010430908203125, "learning_rate": 2e-07, "loss": -0.0029776930809020997, "memory(GiB)": 113.5, "reward": 0.2333333395421505, "reward_std": 0.3883536756038666, "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, "rewards/MultiModalAccuracyORM/std": 0.3883536756038666, "step": 1420, "train_speed(iter/s)": 0.031979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 766.6, "completions/mean_length": 387.12500915527346, "completions/min_length": 211.4, "epoch": 0.5757575757575758, "grad_norm": 2.1728432922463274, "kl": 0.0098236083984375, "learning_rate": 2e-07, "loss": -0.004918041825294495, "memory(GiB)": 113.5, "reward": 0.23333333656191826, "reward_std": 0.10697162449359894, "rewards/MultiModalAccuracyORM/mean": 0.23333333656191826, "rewards/MultiModalAccuracyORM/std": 0.10697162449359894, "step": 1425, "train_speed(iter/s)": 0.031977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.8, "completions/mean_length": 388.7416793823242, "completions/min_length": 235.4, "epoch": 0.5777777777777777, "grad_norm": 1.7935893801244052, "kl": 0.0087493896484375, "learning_rate": 2e-07, "loss": 0.04609963297843933, "memory(GiB)": 113.5, "reward": 0.3500000134110451, "reward_std": 0.32297651171684266, "rewards/MultiModalAccuracyORM/mean": 0.3500000134110451, "rewards/MultiModalAccuracyORM/std": 0.32297651171684266, "step": 1430, "train_speed(iter/s)": 0.031989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.9, "completions/mean_length": 398.00001220703126, "completions/min_length": 208.4, "epoch": 0.5797979797979798, "grad_norm": 2.549829840865519, "kl": 0.010107421875, "learning_rate": 2e-07, "loss": -0.0018973067402839662, "memory(GiB)": 113.5, "reward": 0.4250000089406967, "reward_std": 0.3973225235939026, "rewards/MultiModalAccuracyORM/mean": 0.4250000089406967, "rewards/MultiModalAccuracyORM/std": 0.3973225235939026, "step": 1435, "train_speed(iter/s)": 0.031994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.2, "completions/mean_length": 325.5333450317383, "completions/min_length": 163.1, "epoch": 0.5818181818181818, "grad_norm": 2.800120485549645, "kl": 0.0125640869140625, "learning_rate": 2e-07, "loss": -0.016949039697647095, "memory(GiB)": 113.5, "reward": 0.4166666716337204, "reward_std": 0.34232239723205565, "rewards/MultiModalAccuracyORM/mean": 0.4166666716337204, "rewards/MultiModalAccuracyORM/std": 0.34232239723205565, "step": 1440, "train_speed(iter/s)": 0.032007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.5, "completions/mean_length": 359.07500915527345, "completions/min_length": 200.2, "epoch": 0.5838383838383838, "grad_norm": 2.2400645386442526, "kl": 0.0367034912109375, "learning_rate": 2e-07, "loss": 0.027681028842926024, "memory(GiB)": 113.5, "reward": 0.29166667312383654, "reward_std": 0.29815449118614196, "rewards/MultiModalAccuracyORM/mean": 0.29166667312383654, "rewards/MultiModalAccuracyORM/std": 0.29815449118614196, "step": 1445, "train_speed(iter/s)": 0.032019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.8, "completions/mean_length": 411.00001068115233, "completions/min_length": 244.1, "epoch": 0.5858585858585859, "grad_norm": 2.864884904580614, "kl": 0.009783935546875, "learning_rate": 2e-07, "loss": 0.00823460817337036, "memory(GiB)": 113.5, "reward": 0.3416666768491268, "reward_std": 0.3438218057155609, "rewards/MultiModalAccuracyORM/mean": 0.3416666768491268, "rewards/MultiModalAccuracyORM/std": 0.3438218057155609, "step": 1450, "train_speed(iter/s)": 0.032023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.1, "completions/mean_length": 335.3583465576172, "completions/min_length": 205.1, "epoch": 0.5878787878787879, "grad_norm": 1.4688157931726233, "kl": 0.0092010498046875, "learning_rate": 2e-07, "loss": 0.01696823239326477, "memory(GiB)": 113.5, "reward": 0.37500000968575475, "reward_std": 0.35413345992565154, "rewards/MultiModalAccuracyORM/mean": 0.37500000968575475, "rewards/MultiModalAccuracyORM/std": 0.35413345992565154, "step": 1455, "train_speed(iter/s)": 0.03204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.3, "completions/mean_length": 473.9416748046875, "completions/min_length": 249.7, "epoch": 0.5898989898989899, "grad_norm": 1.1646459187041633, "kl": 0.0099945068359375, "learning_rate": 2e-07, "loss": 0.014775393903255463, "memory(GiB)": 113.5, "reward": 0.2666666738688946, "reward_std": 0.30333785712718964, "rewards/MultiModalAccuracyORM/mean": 0.2666666738688946, "rewards/MultiModalAccuracyORM/std": 0.30333785712718964, "step": 1460, "train_speed(iter/s)": 0.032026 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 701.1, "completions/mean_length": 384.5083404541016, "completions/min_length": 204.1, "epoch": 0.591919191919192, "grad_norm": 0.04302173761513684, "kl": 0.012548828125, "learning_rate": 2e-07, "loss": -0.001154869794845581, "memory(GiB)": 113.5, "reward": 0.3000000141561031, "reward_std": 0.3127244532108307, "rewards/MultiModalAccuracyORM/mean": 0.3000000141561031, "rewards/MultiModalAccuracyORM/std": 0.3127244532108307, "step": 1465, "train_speed(iter/s)": 0.032029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.3, "completions/mean_length": 352.4416763305664, "completions/min_length": 195.1, "epoch": 0.593939393939394, "grad_norm": 2.051161125641378, "kl": 0.014813232421875, "learning_rate": 2e-07, "loss": 0.0119085431098938, "memory(GiB)": 113.5, "reward": 0.3083333395421505, "reward_std": 0.34488060176372526, "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, "rewards/MultiModalAccuracyORM/std": 0.34488060176372526, "step": 1470, "train_speed(iter/s)": 0.03204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 772.7, "completions/mean_length": 466.6416809082031, "completions/min_length": 251.3, "epoch": 0.5959595959595959, "grad_norm": 1.842366669706851, "kl": 0.01016082763671875, "learning_rate": 2e-07, "loss": 0.015132546424865723, "memory(GiB)": 113.5, "reward": 0.22500001043081283, "reward_std": 0.3044206529855728, "rewards/MultiModalAccuracyORM/mean": 0.22500001043081283, "rewards/MultiModalAccuracyORM/std": 0.3044206529855728, "step": 1475, "train_speed(iter/s)": 0.032046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.7, "completions/mean_length": 344.0333480834961, "completions/min_length": 205.0, "epoch": 0.597979797979798, "grad_norm": 0.07710895823869458, "kl": 0.01250762939453125, "learning_rate": 2e-07, "loss": 0.02509859800338745, "memory(GiB)": 113.5, "reward": 0.47500001192092894, "reward_std": 0.2752393215894699, "rewards/MultiModalAccuracyORM/mean": 0.47500001192092894, "rewards/MultiModalAccuracyORM/std": 0.2752393215894699, "step": 1480, "train_speed(iter/s)": 0.032062 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.5, "completions/mean_length": 306.80834197998047, "completions/min_length": 174.4, "epoch": 0.6, "grad_norm": 0.084452934933302, "kl": 0.0158172607421875, "learning_rate": 2e-07, "loss": -0.027300435304641723, "memory(GiB)": 113.5, "reward": 0.17500000521540643, "reward_std": 0.24105713069438933, "rewards/MultiModalAccuracyORM/mean": 0.17500000521540643, "rewards/MultiModalAccuracyORM/std": 0.24105713069438933, "step": 1485, "train_speed(iter/s)": 0.032084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.7, "completions/mean_length": 324.4833389282227, "completions/min_length": 169.9, "epoch": 0.602020202020202, "grad_norm": 1.3165133966084028, "kl": 0.0114501953125, "learning_rate": 2e-07, "loss": 0.004012265801429748, "memory(GiB)": 113.5, "reward": 0.3916666753590107, "reward_std": 0.31046818792819975, "rewards/MultiModalAccuracyORM/mean": 0.3916666753590107, "rewards/MultiModalAccuracyORM/std": 0.31046818792819975, "step": 1490, "train_speed(iter/s)": 0.032103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.7, "completions/mean_length": 336.21668243408203, "completions/min_length": 202.5, "epoch": 0.604040404040404, "grad_norm": 3.938520632284254, "kl": 0.0132232666015625, "learning_rate": 2e-07, "loss": -0.02633047103881836, "memory(GiB)": 113.5, "reward": 0.3416666708886623, "reward_std": 0.3149157464504242, "rewards/MultiModalAccuracyORM/mean": 0.3416666708886623, "rewards/MultiModalAccuracyORM/std": 0.3149157464504242, "step": 1495, "train_speed(iter/s)": 0.032103 }, { "epoch": 0.6060606060606061, "grad_norm": 2.7010910619752164, "learning_rate": 2e-07, "loss": 0.023089283704757692, "memory(GiB)": 113.5, "step": 1500, "train_speed(iter/s)": 0.032112 }, { "epoch": 0.6060606060606061, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 598.76, "eval_completions/mean_length": 375.5383447265625, "eval_completions/min_length": 218.18, "eval_kl": 0.00917266845703125, "eval_loss": -0.012349152937531471, "eval_reward": 0.32000000730156897, "eval_reward_std": 0.3092414766550064, "eval_rewards/MultiModalAccuracyORM/mean": 0.32000000730156897, "eval_rewards/MultiModalAccuracyORM/std": 0.3092414766550064, "eval_runtime": 601.161, "eval_samples_per_second": 0.083, "eval_steps_per_second": 0.008, "step": 1500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.6, "completions/mean_length": 392.15001182556153, "completions/min_length": 216.0, "epoch": 0.6080808080808081, "grad_norm": 1.4655160488310728, "kl": 0.010688018798828126, "learning_rate": 2e-07, "loss": 0.00576329231262207, "memory(GiB)": 113.5, "reward": 0.40416667349636554, "reward_std": 0.31379757523536683, "rewards/MultiModalAccuracyORM/mean": 0.40416667349636554, "rewards/MultiModalAccuracyORM/std": 0.31379757523536683, "step": 1505, "train_speed(iter/s)": 0.031582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.6, "completions/mean_length": 392.9166763305664, "completions/min_length": 185.9, "epoch": 0.6101010101010101, "grad_norm": 2.300152870135833, "kl": 0.0118072509765625, "learning_rate": 2e-07, "loss": 0.01058935523033142, "memory(GiB)": 113.5, "reward": 0.15833333656191825, "reward_std": 0.27622397541999816, "rewards/MultiModalAccuracyORM/mean": 0.15833333656191825, "rewards/MultiModalAccuracyORM/std": 0.27622397541999816, "step": 1510, "train_speed(iter/s)": 0.031594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.5, "completions/mean_length": 401.3166809082031, "completions/min_length": 227.3, "epoch": 0.6121212121212121, "grad_norm": 2.0573660536714256, "kl": 0.01282958984375, "learning_rate": 2e-07, "loss": 0.028659382462501527, "memory(GiB)": 113.5, "reward": 0.27500000819563863, "reward_std": 0.3438218057155609, "rewards/MultiModalAccuracyORM/mean": 0.27500000819563863, "rewards/MultiModalAccuracyORM/std": 0.3438218057155609, "step": 1515, "train_speed(iter/s)": 0.031593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/mean_length": 390.6333465576172, "completions/min_length": 240.3, "epoch": 0.6141414141414141, "grad_norm": 1.4644802229965364, "kl": 0.0115325927734375, "learning_rate": 2e-07, "loss": 0.009964641928672791, "memory(GiB)": 113.5, "reward": 0.20833334624767302, "reward_std": 0.25113856196403506, "rewards/MultiModalAccuracyORM/mean": 0.20833334624767302, "rewards/MultiModalAccuracyORM/std": 0.25113856196403506, "step": 1520, "train_speed(iter/s)": 0.031607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.1, "completions/mean_length": 338.3666763305664, "completions/min_length": 187.9, "epoch": 0.6161616161616161, "grad_norm": 2.312953380739967, "kl": 0.011322021484375, "learning_rate": 2e-07, "loss": 0.0045973040163516995, "memory(GiB)": 113.5, "reward": 0.22500000521540642, "reward_std": 0.22224704921245575, "rewards/MultiModalAccuracyORM/mean": 0.22500000521540642, "rewards/MultiModalAccuracyORM/std": 0.22224704921245575, "step": 1525, "train_speed(iter/s)": 0.03163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.6, "completions/mean_length": 368.7583465576172, "completions/min_length": 193.4, "epoch": 0.6181818181818182, "grad_norm": 3.0723153433233095, "kl": 0.0133697509765625, "learning_rate": 2e-07, "loss": -0.030410391092300416, "memory(GiB)": 113.5, "reward": 0.25000000819563867, "reward_std": 0.35340302884578706, "rewards/MultiModalAccuracyORM/mean": 0.25000000819563867, "rewards/MultiModalAccuracyORM/std": 0.35340302884578706, "step": 1530, "train_speed(iter/s)": 0.03164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.6, "completions/mean_length": 351.21667633056643, "completions/min_length": 227.9, "epoch": 0.6202020202020202, "grad_norm": 1.4538179467280616, "kl": 0.01126708984375, "learning_rate": 2e-07, "loss": 0.0038071274757385254, "memory(GiB)": 113.5, "reward": 0.31666667610406873, "reward_std": 0.27749558687210085, "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, "step": 1535, "train_speed(iter/s)": 0.031636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.2, "completions/mean_length": 362.9000076293945, "completions/min_length": 196.3, "epoch": 0.6222222222222222, "grad_norm": 2.3834408729545817, "kl": 0.011865234375, "learning_rate": 2e-07, "loss": -0.007588768005371093, "memory(GiB)": 113.5, "reward": 0.2500000029802322, "reward_std": 0.2885732680559158, "rewards/MultiModalAccuracyORM/mean": 0.2500000029802322, "rewards/MultiModalAccuracyORM/std": 0.2885732680559158, "step": 1540, "train_speed(iter/s)": 0.031648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.7, "completions/mean_length": 418.4583480834961, "completions/min_length": 246.6, "epoch": 0.6242424242424243, "grad_norm": 1.498638277562189, "kl": 0.0112030029296875, "learning_rate": 2e-07, "loss": 0.00476650595664978, "memory(GiB)": 113.5, "reward": 0.2583333387970924, "reward_std": 0.3297544777393341, "rewards/MultiModalAccuracyORM/mean": 0.2583333387970924, "rewards/MultiModalAccuracyORM/std": 0.3297544777393341, "step": 1545, "train_speed(iter/s)": 0.031653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.1, "completions/mean_length": 338.0750076293945, "completions/min_length": 180.2, "epoch": 0.6262626262626263, "grad_norm": 1.3673556260797224, "kl": 0.012890625, "learning_rate": 2e-07, "loss": 0.011944988369941711, "memory(GiB)": 113.5, "reward": 0.2666666731238365, "reward_std": 0.36717758774757386, "rewards/MultiModalAccuracyORM/mean": 0.2666666731238365, "rewards/MultiModalAccuracyORM/std": 0.36717758774757386, "step": 1550, "train_speed(iter/s)": 0.031667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 389.5916778564453, "completions/min_length": 242.8, "epoch": 0.6282828282828283, "grad_norm": 2.327729044871898, "kl": 0.014337158203125, "learning_rate": 2e-07, "loss": -0.015535221993923187, "memory(GiB)": 113.5, "reward": 0.17500000298023224, "reward_std": 0.3498097449541092, "rewards/MultiModalAccuracyORM/mean": 0.17500000298023224, "rewards/MultiModalAccuracyORM/std": 0.3498097449541092, "step": 1555, "train_speed(iter/s)": 0.031684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/mean_length": 392.79168090820315, "completions/min_length": 227.0, "epoch": 0.6303030303030303, "grad_norm": 0.053806194925700226, "kl": 0.0107635498046875, "learning_rate": 2e-07, "loss": 0.017643353343009947, "memory(GiB)": 113.5, "reward": 0.19166667014360428, "reward_std": 0.3011411875486374, "rewards/MultiModalAccuracyORM/mean": 0.19166667014360428, "rewards/MultiModalAccuracyORM/std": 0.3011411875486374, "step": 1560, "train_speed(iter/s)": 0.031689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.9, "completions/mean_length": 332.08333587646484, "completions/min_length": 183.7, "epoch": 0.6323232323232323, "grad_norm": 0.570186834834556, "kl": 0.016534423828125, "learning_rate": 2e-07, "loss": -0.02576545476913452, "memory(GiB)": 113.5, "reward": 0.3000000089406967, "reward_std": 0.3503421902656555, "rewards/MultiModalAccuracyORM/mean": 0.3000000089406967, "rewards/MultiModalAccuracyORM/std": 0.3503421902656555, "step": 1565, "train_speed(iter/s)": 0.031696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.6, "completions/mean_length": 380.7333465576172, "completions/min_length": 206.4, "epoch": 0.6343434343434343, "grad_norm": 2.2565482508451735, "kl": 0.0091888427734375, "learning_rate": 2e-07, "loss": 0.01603304147720337, "memory(GiB)": 113.5, "reward": 0.2833333440124989, "reward_std": 0.3637146830558777, "rewards/MultiModalAccuracyORM/mean": 0.2833333440124989, "rewards/MultiModalAccuracyORM/std": 0.3637146830558777, "step": 1570, "train_speed(iter/s)": 0.031705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.3, "completions/mean_length": 343.33334197998045, "completions/min_length": 181.6, "epoch": 0.6363636363636364, "grad_norm": 1.578397296268303, "kl": 0.013397216796875, "learning_rate": 2e-07, "loss": 0.04952932298183441, "memory(GiB)": 113.5, "reward": 0.4666666738688946, "reward_std": 0.37498117983341217, "rewards/MultiModalAccuracyORM/mean": 0.4666666738688946, "rewards/MultiModalAccuracyORM/std": 0.37498117983341217, "step": 1575, "train_speed(iter/s)": 0.031725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 713.8, "completions/mean_length": 377.916682434082, "completions/min_length": 199.8, "epoch": 0.6383838383838384, "grad_norm": 1.458403513622403, "kl": 0.0122344970703125, "learning_rate": 2e-07, "loss": 0.016104981303215027, "memory(GiB)": 113.5, "reward": 0.4833333417773247, "reward_std": 0.3252659499645233, "rewards/MultiModalAccuracyORM/mean": 0.4833333417773247, "rewards/MultiModalAccuracyORM/std": 0.3252659499645233, "step": 1580, "train_speed(iter/s)": 0.031726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 678.8, "completions/mean_length": 418.26668243408204, "completions/min_length": 202.3, "epoch": 0.6404040404040404, "grad_norm": 1.8686390380230793, "kl": 0.013201904296875, "learning_rate": 2e-07, "loss": 0.011665409803390503, "memory(GiB)": 113.5, "reward": 0.3000000096857548, "reward_std": 0.2652414858341217, "rewards/MultiModalAccuracyORM/mean": 0.3000000096857548, "rewards/MultiModalAccuracyORM/std": 0.2652414858341217, "step": 1585, "train_speed(iter/s)": 0.031728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.2, "completions/mean_length": 388.46668395996096, "completions/min_length": 206.4, "epoch": 0.6424242424242425, "grad_norm": 1.992138254292841, "kl": 0.011810302734375, "learning_rate": 2e-07, "loss": 0.08419913649559022, "memory(GiB)": 113.5, "reward": 0.2916666746139526, "reward_std": 0.4093579977750778, "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, "rewards/MultiModalAccuracyORM/std": 0.4093579977750778, "step": 1590, "train_speed(iter/s)": 0.031721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.3, "completions/mean_length": 303.75001220703126, "completions/min_length": 176.1, "epoch": 0.6444444444444445, "grad_norm": 1.636979804109864, "kl": 0.0161895751953125, "learning_rate": 2e-07, "loss": 0.011809319257736206, "memory(GiB)": 113.5, "reward": 0.3333333425223827, "reward_std": 0.25897532403469087, "rewards/MultiModalAccuracyORM/mean": 0.3333333425223827, "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, "step": 1595, "train_speed(iter/s)": 0.031748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.9, "completions/mean_length": 333.1666763305664, "completions/min_length": 160.1, "epoch": 0.6464646464646465, "grad_norm": 2.2869229330092393, "kl": 0.01339111328125, "learning_rate": 2e-07, "loss": 0.005678671598434448, "memory(GiB)": 113.5, "reward": 0.3416666753590107, "reward_std": 0.3189666152000427, "rewards/MultiModalAccuracyORM/mean": 0.3416666753590107, "rewards/MultiModalAccuracyORM/std": 0.3189666152000427, "step": 1600, "train_speed(iter/s)": 0.031761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.7, "completions/mean_length": 279.3166778564453, "completions/min_length": 174.7, "epoch": 0.6484848484848484, "grad_norm": 1.3720767401777028, "kl": 0.014947509765625, "learning_rate": 2e-07, "loss": 0.0007772698998451232, "memory(GiB)": 113.5, "reward": 0.2916666753590107, "reward_std": 0.29786467254161836, "rewards/MultiModalAccuracyORM/mean": 0.2916666753590107, "rewards/MultiModalAccuracyORM/std": 0.29786467254161836, "step": 1605, "train_speed(iter/s)": 0.031778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.7, "completions/mean_length": 286.30834197998047, "completions/min_length": 167.6, "epoch": 0.6505050505050505, "grad_norm": 2.272498565917859, "kl": 0.0147857666015625, "learning_rate": 2e-07, "loss": 0.03825833797454834, "memory(GiB)": 113.5, "reward": 0.30833333656191825, "reward_std": 0.3430673748254776, "rewards/MultiModalAccuracyORM/mean": 0.30833333656191825, "rewards/MultiModalAccuracyORM/std": 0.3430673748254776, "step": 1610, "train_speed(iter/s)": 0.031791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.7, "completions/mean_length": 305.6833435058594, "completions/min_length": 176.9, "epoch": 0.6525252525252525, "grad_norm": 2.101651221741828, "kl": 0.01689453125, "learning_rate": 2e-07, "loss": -0.010073482990264893, "memory(GiB)": 113.5, "reward": 0.40833334550261496, "reward_std": 0.3845028102397919, "rewards/MultiModalAccuracyORM/mean": 0.40833334550261496, "rewards/MultiModalAccuracyORM/std": 0.3845028102397919, "step": 1615, "train_speed(iter/s)": 0.031812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.2, "completions/mean_length": 355.3666732788086, "completions/min_length": 217.0, "epoch": 0.6545454545454545, "grad_norm": 1.7437833008639363, "kl": 0.014605712890625, "learning_rate": 2e-07, "loss": 0.03341163992881775, "memory(GiB)": 113.5, "reward": 0.3083333432674408, "reward_std": 0.3104085922241211, "rewards/MultiModalAccuracyORM/mean": 0.3083333432674408, "rewards/MultiModalAccuracyORM/std": 0.3104085922241211, "step": 1620, "train_speed(iter/s)": 0.031815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.7, "completions/mean_length": 320.88334197998046, "completions/min_length": 176.8, "epoch": 0.6565656565656566, "grad_norm": 2.214426657751653, "kl": 0.012939453125, "learning_rate": 2e-07, "loss": 0.0038519926369190217, "memory(GiB)": 113.5, "reward": 0.4500000111758709, "reward_std": 0.3840597689151764, "rewards/MultiModalAccuracyORM/mean": 0.4500000111758709, "rewards/MultiModalAccuracyORM/std": 0.3840597689151764, "step": 1625, "train_speed(iter/s)": 0.031834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.5, "completions/mean_length": 307.9750091552734, "completions/min_length": 179.6, "epoch": 0.6585858585858586, "grad_norm": 2.3559044349874965, "kl": 0.011468505859375, "learning_rate": 2e-07, "loss": -0.007926353812217712, "memory(GiB)": 113.5, "reward": 0.3500000089406967, "reward_std": 0.21594529151916503, "rewards/MultiModalAccuracyORM/mean": 0.3500000089406967, "rewards/MultiModalAccuracyORM/std": 0.21594529151916503, "step": 1630, "train_speed(iter/s)": 0.031861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/mean_length": 272.0000061035156, "completions/min_length": 140.0, "epoch": 0.6606060606060606, "grad_norm": 2.3218216739931163, "kl": 0.01510009765625, "learning_rate": 2e-07, "loss": -0.017690959572792053, "memory(GiB)": 113.5, "reward": 0.28333333805203437, "reward_std": 0.20416739881038665, "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, "rewards/MultiModalAccuracyORM/std": 0.20416739881038665, "step": 1635, "train_speed(iter/s)": 0.03188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/mean_length": 346.666682434082, "completions/min_length": 213.6, "epoch": 0.6626262626262627, "grad_norm": 1.7521312796960462, "kl": 0.011456298828125, "learning_rate": 2e-07, "loss": -0.01213396042585373, "memory(GiB)": 113.5, "reward": 0.34166667237877846, "reward_std": 0.2464074045419693, "rewards/MultiModalAccuracyORM/mean": 0.34166667237877846, "rewards/MultiModalAccuracyORM/std": 0.2464074045419693, "step": 1640, "train_speed(iter/s)": 0.031896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/mean_length": 235.60834426879882, "completions/min_length": 120.5, "epoch": 0.6646464646464646, "grad_norm": 2.846675522202014, "kl": 0.0129638671875, "learning_rate": 2e-07, "loss": -0.01681770384311676, "memory(GiB)": 113.5, "reward": 0.4000000089406967, "reward_std": 0.364131298661232, "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, "rewards/MultiModalAccuracyORM/std": 0.364131298661232, "step": 1645, "train_speed(iter/s)": 0.031917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.8, "completions/mean_length": 355.72500762939455, "completions/min_length": 171.8, "epoch": 0.6666666666666666, "grad_norm": 2.5063086447109546, "kl": 0.01486053466796875, "learning_rate": 2e-07, "loss": 0.005304119735956192, "memory(GiB)": 113.5, "reward": 0.3500000134110451, "reward_std": 0.41141627728939056, "rewards/MultiModalAccuracyORM/mean": 0.3500000134110451, "rewards/MultiModalAccuracyORM/std": 0.41141627728939056, "step": 1650, "train_speed(iter/s)": 0.031924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.8, "completions/mean_length": 412.1666793823242, "completions/min_length": 210.8, "epoch": 0.6686868686868687, "grad_norm": 2.9814971352286297, "kl": 0.018035888671875, "learning_rate": 2e-07, "loss": 0.0013743340969085693, "memory(GiB)": 113.5, "reward": 0.2083333373069763, "reward_std": 0.28402756750583646, "rewards/MultiModalAccuracyORM/mean": 0.2083333373069763, "rewards/MultiModalAccuracyORM/std": 0.28402756750583646, "step": 1655, "train_speed(iter/s)": 0.031912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.7, "completions/mean_length": 315.45000915527345, "completions/min_length": 164.2, "epoch": 0.6707070707070707, "grad_norm": 2.420560710043236, "kl": 0.013409423828125, "learning_rate": 2e-07, "loss": -0.0018982872366905212, "memory(GiB)": 113.5, "reward": 0.20000000521540642, "reward_std": 0.25270916223526, "rewards/MultiModalAccuracyORM/mean": 0.20000000521540642, "rewards/MultiModalAccuracyORM/std": 0.25270916223526, "step": 1660, "train_speed(iter/s)": 0.031926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.3, "completions/mean_length": 373.15834045410156, "completions/min_length": 187.0, "epoch": 0.6727272727272727, "grad_norm": 2.099245716082938, "kl": 0.0146942138671875, "learning_rate": 2e-07, "loss": 0.0194022536277771, "memory(GiB)": 113.5, "reward": 0.24166667237877845, "reward_std": 0.29383077621459963, "rewards/MultiModalAccuracyORM/mean": 0.24166667237877845, "rewards/MultiModalAccuracyORM/std": 0.29383077621459963, "step": 1665, "train_speed(iter/s)": 0.031939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.1, "completions/mean_length": 409.43335113525393, "completions/min_length": 252.2, "epoch": 0.6747474747474748, "grad_norm": 0.8827203782530715, "kl": 0.01336669921875, "learning_rate": 2e-07, "loss": 0.022216227650642396, "memory(GiB)": 113.5, "reward": 0.2833333410322666, "reward_std": 0.23704480826854707, "rewards/MultiModalAccuracyORM/mean": 0.2833333410322666, "rewards/MultiModalAccuracyORM/std": 0.23704480826854707, "step": 1670, "train_speed(iter/s)": 0.031942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.7, "completions/mean_length": 372.35001068115236, "completions/min_length": 223.6, "epoch": 0.6767676767676768, "grad_norm": 2.67307804927538, "kl": 0.012176513671875, "learning_rate": 2e-07, "loss": -0.025462892651557923, "memory(GiB)": 113.5, "reward": 0.3583333410322666, "reward_std": 0.30489686131477356, "rewards/MultiModalAccuracyORM/mean": 0.3583333410322666, "rewards/MultiModalAccuracyORM/std": 0.30489686131477356, "step": 1675, "train_speed(iter/s)": 0.031947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.9, "completions/mean_length": 252.71667709350587, "completions/min_length": 134.5, "epoch": 0.6787878787878788, "grad_norm": 2.948416033259282, "kl": 0.013824462890625, "learning_rate": 2e-07, "loss": 0.007326580584049225, "memory(GiB)": 113.5, "reward": 0.5083333514630795, "reward_std": 0.3945842385292053, "rewards/MultiModalAccuracyORM/mean": 0.5083333514630795, "rewards/MultiModalAccuracyORM/std": 0.3945842385292053, "step": 1680, "train_speed(iter/s)": 0.031971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.6, "completions/mean_length": 367.6166732788086, "completions/min_length": 225.0, "epoch": 0.6808080808080809, "grad_norm": 0.07197046485321759, "kl": 0.0118194580078125, "learning_rate": 2e-07, "loss": 0.03796108365058899, "memory(GiB)": 113.5, "reward": 0.23333333879709245, "reward_std": 0.20995735228061677, "rewards/MultiModalAccuracyORM/mean": 0.23333333879709245, "rewards/MultiModalAccuracyORM/std": 0.20995735228061677, "step": 1685, "train_speed(iter/s)": 0.031972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/mean_length": 348.7500061035156, "completions/min_length": 212.9, "epoch": 0.6828282828282828, "grad_norm": 1.5442560082143544, "kl": 0.01568603515625, "learning_rate": 2e-07, "loss": 0.017047417163848878, "memory(GiB)": 113.5, "reward": 0.358333345502615, "reward_std": 0.4405413746833801, "rewards/MultiModalAccuracyORM/mean": 0.358333345502615, "rewards/MultiModalAccuracyORM/std": 0.4405413746833801, "step": 1690, "train_speed(iter/s)": 0.031977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.5, "completions/mean_length": 318.30000228881835, "completions/min_length": 162.4, "epoch": 0.6848484848484848, "grad_norm": 2.822151558666746, "kl": 0.013775634765625, "learning_rate": 2e-07, "loss": 0.03140446245670318, "memory(GiB)": 113.5, "reward": 0.3583333469927311, "reward_std": 0.399324569106102, "rewards/MultiModalAccuracyORM/mean": 0.3583333469927311, "rewards/MultiModalAccuracyORM/std": 0.399324569106102, "step": 1695, "train_speed(iter/s)": 0.031999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.9, "completions/mean_length": 485.1916870117187, "completions/min_length": 305.5, "epoch": 0.6868686868686869, "grad_norm": 0.9869928398468556, "kl": 0.0103057861328125, "learning_rate": 2e-07, "loss": 0.018257686495780946, "memory(GiB)": 113.5, "reward": 0.2583333417773247, "reward_std": 0.29035089910030365, "rewards/MultiModalAccuracyORM/mean": 0.2583333417773247, "rewards/MultiModalAccuracyORM/std": 0.29035089910030365, "step": 1700, "train_speed(iter/s)": 0.031993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.9, "completions/mean_length": 329.5166725158691, "completions/min_length": 196.6, "epoch": 0.6888888888888889, "grad_norm": 2.390331116834798, "kl": 0.0239990234375, "learning_rate": 2e-07, "loss": -0.02088260054588318, "memory(GiB)": 113.5, "reward": 0.4750000089406967, "reward_std": 0.27753118276596067, "rewards/MultiModalAccuracyORM/mean": 0.4750000089406967, "rewards/MultiModalAccuracyORM/std": 0.27753118276596067, "step": 1705, "train_speed(iter/s)": 0.032012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.8, "completions/mean_length": 286.5166694641113, "completions/min_length": 153.0, "epoch": 0.6909090909090909, "grad_norm": 3.070912031712293, "kl": 0.0171630859375, "learning_rate": 2e-07, "loss": 0.00493430495262146, "memory(GiB)": 113.5, "reward": 0.45833334028720857, "reward_std": 0.31192905008792876, "rewards/MultiModalAccuracyORM/mean": 0.45833334028720857, "rewards/MultiModalAccuracyORM/std": 0.31192905008792876, "step": 1710, "train_speed(iter/s)": 0.032022 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.9, "completions/mean_length": 243.5666763305664, "completions/min_length": 117.3, "epoch": 0.692929292929293, "grad_norm": 2.7698758058054214, "kl": 0.0126708984375, "learning_rate": 2e-07, "loss": -0.0016166016459465027, "memory(GiB)": 113.5, "reward": 0.42500000819563866, "reward_std": 0.25512445867061617, "rewards/MultiModalAccuracyORM/mean": 0.42500000819563866, "rewards/MultiModalAccuracyORM/std": 0.25512445867061617, "step": 1715, "train_speed(iter/s)": 0.032047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.2, "completions/mean_length": 255.48334503173828, "completions/min_length": 158.4, "epoch": 0.694949494949495, "grad_norm": 2.744690041316947, "kl": 0.015838623046875, "learning_rate": 2e-07, "loss": -0.019546210765838623, "memory(GiB)": 113.5, "reward": 0.26666667833924296, "reward_std": 0.2754935443401337, "rewards/MultiModalAccuracyORM/mean": 0.26666667833924296, "rewards/MultiModalAccuracyORM/std": 0.2754935443401337, "step": 1720, "train_speed(iter/s)": 0.032071 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.9, "completions/mean_length": 379.2666839599609, "completions/min_length": 221.8, "epoch": 0.696969696969697, "grad_norm": 1.777431935717832, "kl": 0.012689208984375, "learning_rate": 2e-07, "loss": 0.009233607351779938, "memory(GiB)": 113.5, "reward": 0.3083333395421505, "reward_std": 0.28128685653209684, "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, "rewards/MultiModalAccuracyORM/std": 0.28128685653209684, "step": 1725, "train_speed(iter/s)": 0.032079 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/mean_length": 354.1666770935059, "completions/min_length": 171.8, "epoch": 0.6989898989898989, "grad_norm": 3.0383086202130616, "kl": 0.01773681640625, "learning_rate": 2e-07, "loss": 0.03813132643699646, "memory(GiB)": 113.5, "reward": 0.31666666865348814, "reward_std": 0.27938000559806825, "rewards/MultiModalAccuracyORM/mean": 0.31666666865348814, "rewards/MultiModalAccuracyORM/std": 0.27938000559806825, "step": 1730, "train_speed(iter/s)": 0.032094 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.8, "completions/mean_length": 351.7666717529297, "completions/min_length": 157.5, "epoch": 0.701010101010101, "grad_norm": 1.8778050454869868, "kl": 0.014495849609375, "learning_rate": 2e-07, "loss": 0.00038725733757019045, "memory(GiB)": 113.5, "reward": 0.4833333432674408, "reward_std": 0.33153211176395414, "rewards/MultiModalAccuracyORM/mean": 0.4833333432674408, "rewards/MultiModalAccuracyORM/std": 0.33153211176395414, "step": 1735, "train_speed(iter/s)": 0.032098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.2, "completions/mean_length": 350.0083404541016, "completions/min_length": 189.6, "epoch": 0.703030303030303, "grad_norm": 2.1562095065119053, "kl": 0.0187957763671875, "learning_rate": 2e-07, "loss": -0.02958904504776001, "memory(GiB)": 113.5, "reward": 0.28333333879709244, "reward_std": 0.3487591862678528, "rewards/MultiModalAccuracyORM/mean": 0.28333333879709244, "rewards/MultiModalAccuracyORM/std": 0.3487591862678528, "step": 1740, "train_speed(iter/s)": 0.032101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.2, "completions/mean_length": 310.36668090820314, "completions/min_length": 187.0, "epoch": 0.705050505050505, "grad_norm": 2.8598050443718797, "kl": 0.013079833984375, "learning_rate": 2e-07, "loss": -0.007939225435256958, "memory(GiB)": 113.5, "reward": 0.35000000819563865, "reward_std": 0.3908045649528503, "rewards/MultiModalAccuracyORM/mean": 0.35000000819563865, "rewards/MultiModalAccuracyORM/std": 0.3908045649528503, "step": 1745, "train_speed(iter/s)": 0.032116 }, { "epoch": 0.7070707070707071, "grad_norm": 2.0976510908729256, "learning_rate": 2e-07, "loss": 0.05007731318473816, "memory(GiB)": 113.5, "step": 1750, "train_speed(iter/s)": 0.03212 }, { "epoch": 0.7070707070707071, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0016666666666666666, "eval_completions/max_length": 587.9, "eval_completions/mean_length": 354.56501251220703, "eval_completions/min_length": 214.28, "eval_kl": 0.01150848388671875, "eval_loss": 0.0095694400370121, "eval_reward": 0.3250000074505806, "eval_reward_std": 0.32090782165527343, "eval_rewards/MultiModalAccuracyORM/mean": 0.3250000074505806, "eval_rewards/MultiModalAccuracyORM/std": 0.32090782165527343, "eval_runtime": 581.3868, "eval_samples_per_second": 0.086, "eval_steps_per_second": 0.009, "step": 1750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.3, "completions/mean_length": 361.1250114440918, "completions/min_length": 205.05, "epoch": 0.7090909090909091, "grad_norm": 1.92993215123609, "kl": 0.01456298828125, "learning_rate": 2e-07, "loss": 0.013172458112239837, "memory(GiB)": 113.5, "reward": 0.3041666720062494, "reward_std": 0.3517512962222099, "rewards/MultiModalAccuracyORM/mean": 0.3041666720062494, "rewards/MultiModalAccuracyORM/std": 0.3517512962222099, "step": 1755, "train_speed(iter/s)": 0.031659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 877.0, "completions/mean_length": 508.4500274658203, "completions/min_length": 241.1, "epoch": 0.7111111111111111, "grad_norm": 1.960501031799004, "kl": 0.01246490478515625, "learning_rate": 2e-07, "loss": -0.01800227165222168, "memory(GiB)": 113.5, "reward": 0.24166667461395264, "reward_std": 0.40063177347183226, "rewards/MultiModalAccuracyORM/mean": 0.24166667461395264, "rewards/MultiModalAccuracyORM/std": 0.40063177347183226, "step": 1760, "train_speed(iter/s)": 0.031653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.9, "completions/mean_length": 389.05833740234374, "completions/min_length": 194.6, "epoch": 0.7131313131313132, "grad_norm": 0.06856184885436768, "kl": 0.0163360595703125, "learning_rate": 2e-07, "loss": 0.05879574418067932, "memory(GiB)": 113.5, "reward": 0.45000000223517417, "reward_std": 0.26600751280784607, "rewards/MultiModalAccuracyORM/mean": 0.45000000223517417, "rewards/MultiModalAccuracyORM/std": 0.26600751280784607, "step": 1765, "train_speed(iter/s)": 0.031655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.8, "completions/mean_length": 364.608349609375, "completions/min_length": 206.9, "epoch": 0.7151515151515152, "grad_norm": 2.5267727464559195, "kl": 0.013262939453125, "learning_rate": 2e-07, "loss": -0.05543935298919678, "memory(GiB)": 113.5, "reward": 0.4583333507180214, "reward_std": 0.349611759185791, "rewards/MultiModalAccuracyORM/mean": 0.4583333507180214, "rewards/MultiModalAccuracyORM/std": 0.349611759185791, "step": 1770, "train_speed(iter/s)": 0.031665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.2, "completions/mean_length": 349.9750091552734, "completions/min_length": 182.8, "epoch": 0.7171717171717171, "grad_norm": 1.8053530203955317, "kl": 0.016015625, "learning_rate": 2e-07, "loss": 0.003249824047088623, "memory(GiB)": 113.5, "reward": 0.21666667535901069, "reward_std": 0.36190145611763, "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, "rewards/MultiModalAccuracyORM/std": 0.36190145611763, "step": 1775, "train_speed(iter/s)": 0.031667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.5, "completions/mean_length": 257.55000534057615, "completions/min_length": 140.9, "epoch": 0.7191919191919192, "grad_norm": 2.81422482443103, "kl": 0.0193359375, "learning_rate": 2e-07, "loss": -0.02224818170070648, "memory(GiB)": 113.5, "reward": 0.30000000819563866, "reward_std": 0.3563897281885147, "rewards/MultiModalAccuracyORM/mean": 0.30000000819563866, "rewards/MultiModalAccuracyORM/std": 0.3563897281885147, "step": 1780, "train_speed(iter/s)": 0.031687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.3, "completions/mean_length": 291.2166748046875, "completions/min_length": 164.3, "epoch": 0.7212121212121212, "grad_norm": 2.8650400951164525, "kl": 0.0154205322265625, "learning_rate": 2e-07, "loss": -0.02759958803653717, "memory(GiB)": 113.5, "reward": 0.21666667014360427, "reward_std": 0.2892681032419205, "rewards/MultiModalAccuracyORM/mean": 0.21666667014360427, "rewards/MultiModalAccuracyORM/std": 0.2892681032419205, "step": 1785, "train_speed(iter/s)": 0.031703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.8, "completions/mean_length": 382.2583404541016, "completions/min_length": 222.3, "epoch": 0.7232323232323232, "grad_norm": 0.04924378999535635, "kl": 0.0165863037109375, "learning_rate": 2e-07, "loss": 0.002944570779800415, "memory(GiB)": 113.5, "reward": 0.1833333395421505, "reward_std": 0.3059200614690781, "rewards/MultiModalAccuracyORM/mean": 0.1833333395421505, "rewards/MultiModalAccuracyORM/std": 0.3059200614690781, "step": 1790, "train_speed(iter/s)": 0.031714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.8, "completions/mean_length": 361.8166778564453, "completions/min_length": 215.4, "epoch": 0.7252525252525253, "grad_norm": 1.0400490856507523, "kl": 0.015380859375, "learning_rate": 2e-07, "loss": 0.0032314777374267576, "memory(GiB)": 113.5, "reward": 0.21666667312383653, "reward_std": 0.3141998678445816, "rewards/MultiModalAccuracyORM/mean": 0.21666667312383653, "rewards/MultiModalAccuracyORM/std": 0.3141998678445816, "step": 1795, "train_speed(iter/s)": 0.031727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.5, "completions/mean_length": 366.5666793823242, "completions/min_length": 172.7, "epoch": 0.7272727272727273, "grad_norm": 1.579739169824459, "kl": 0.015093994140625, "learning_rate": 2e-07, "loss": -0.01905302405357361, "memory(GiB)": 113.5, "reward": 0.3166666753590107, "reward_std": 0.320466023683548, "rewards/MultiModalAccuracyORM/mean": 0.3166666753590107, "rewards/MultiModalAccuracyORM/std": 0.320466023683548, "step": 1800, "train_speed(iter/s)": 0.031739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.7, "completions/mean_length": 319.3916732788086, "completions/min_length": 186.6, "epoch": 0.7292929292929293, "grad_norm": 3.0687917767271022, "kl": 0.0137054443359375, "learning_rate": 2e-07, "loss": 0.02089669108390808, "memory(GiB)": 113.5, "reward": 0.3666666693985462, "reward_std": 0.47085520029067995, "rewards/MultiModalAccuracyORM/mean": 0.3666666693985462, "rewards/MultiModalAccuracyORM/std": 0.47085520029067995, "step": 1805, "train_speed(iter/s)": 0.031757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.7, "completions/mean_length": 375.0416778564453, "completions/min_length": 215.7, "epoch": 0.7313131313131314, "grad_norm": 1.8933797302813846, "kl": 0.0150848388671875, "learning_rate": 2e-07, "loss": 0.03909637928009033, "memory(GiB)": 113.5, "reward": 0.39166667610406875, "reward_std": 0.34688264429569243, "rewards/MultiModalAccuracyORM/mean": 0.39166667610406875, "rewards/MultiModalAccuracyORM/std": 0.34688264429569243, "step": 1810, "train_speed(iter/s)": 0.031762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.9, "completions/mean_length": 305.2583404541016, "completions/min_length": 157.4, "epoch": 0.7333333333333333, "grad_norm": 1.8206918881783931, "kl": 0.0149169921875, "learning_rate": 2e-07, "loss": -0.016247293353080748, "memory(GiB)": 113.5, "reward": 0.2666666701436043, "reward_std": 0.32451151609420775, "rewards/MultiModalAccuracyORM/mean": 0.2666666701436043, "rewards/MultiModalAccuracyORM/std": 0.32451151609420775, "step": 1815, "train_speed(iter/s)": 0.031772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.8, "completions/mean_length": 322.85000762939455, "completions/min_length": 183.5, "epoch": 0.7353535353535353, "grad_norm": 1.918825325754338, "kl": 0.01243896484375, "learning_rate": 2e-07, "loss": 0.010172617435455323, "memory(GiB)": 113.5, "reward": 0.20000000223517417, "reward_std": 0.21999078392982482, "rewards/MultiModalAccuracyORM/mean": 0.20000000223517417, "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, "step": 1820, "train_speed(iter/s)": 0.031781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.5, "completions/mean_length": 253.28334197998046, "completions/min_length": 136.0, "epoch": 0.7373737373737373, "grad_norm": 2.5646468814628482, "kl": 0.01630859375, "learning_rate": 2e-07, "loss": 0.08878597021102905, "memory(GiB)": 113.5, "reward": 0.4000000089406967, "reward_std": 0.3767348140478134, "rewards/MultiModalAccuracyORM/mean": 0.4000000089406967, "rewards/MultiModalAccuracyORM/std": 0.3767348140478134, "step": 1825, "train_speed(iter/s)": 0.031805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 786.5, "completions/mean_length": 391.63334197998046, "completions/min_length": 175.8, "epoch": 0.7393939393939394, "grad_norm": 2.279597838394587, "kl": 0.016058349609375, "learning_rate": 2e-07, "loss": -0.01255677342414856, "memory(GiB)": 113.5, "reward": 0.33333333656191827, "reward_std": 0.30187161862850187, "rewards/MultiModalAccuracyORM/mean": 0.33333333656191827, "rewards/MultiModalAccuracyORM/std": 0.30187161862850187, "step": 1830, "train_speed(iter/s)": 0.031799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/mean_length": 351.30834045410154, "completions/min_length": 184.7, "epoch": 0.7414141414141414, "grad_norm": 1.220249950163537, "kl": 0.014288330078125, "learning_rate": 2e-07, "loss": -0.03182802200317383, "memory(GiB)": 113.5, "reward": 0.4250000067055225, "reward_std": 0.40566191971302035, "rewards/MultiModalAccuracyORM/mean": 0.4250000067055225, "rewards/MultiModalAccuracyORM/std": 0.40566191971302035, "step": 1835, "train_speed(iter/s)": 0.031813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.3, "completions/mean_length": 289.9583404541016, "completions/min_length": 139.6, "epoch": 0.7434343434343434, "grad_norm": 3.7311094209711153, "kl": 0.019146728515625, "learning_rate": 2e-07, "loss": -0.02434406876564026, "memory(GiB)": 113.5, "reward": 0.4333333432674408, "reward_std": 0.3922538310289383, "rewards/MultiModalAccuracyORM/mean": 0.4333333432674408, "rewards/MultiModalAccuracyORM/std": 0.3922538310289383, "step": 1840, "train_speed(iter/s)": 0.031824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.3, "completions/mean_length": 292.6333404541016, "completions/min_length": 165.6, "epoch": 0.7454545454545455, "grad_norm": 2.2491131974096503, "kl": 0.018963623046875, "learning_rate": 2e-07, "loss": -0.029304242134094237, "memory(GiB)": 113.5, "reward": 0.4250000096857548, "reward_std": 0.3370794355869293, "rewards/MultiModalAccuracyORM/mean": 0.4250000096857548, "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, "step": 1845, "train_speed(iter/s)": 0.031836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.7, "completions/mean_length": 369.1416778564453, "completions/min_length": 209.3, "epoch": 0.7474747474747475, "grad_norm": 1.8956894287229566, "kl": 0.015484619140625, "learning_rate": 2e-07, "loss": 0.015110939741134644, "memory(GiB)": 113.5, "reward": 0.2916666716337204, "reward_std": 0.4038462698459625, "rewards/MultiModalAccuracyORM/mean": 0.2916666716337204, "rewards/MultiModalAccuracyORM/std": 0.4038462698459625, "step": 1850, "train_speed(iter/s)": 0.031843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/mean_length": 357.9083419799805, "completions/min_length": 162.4, "epoch": 0.7494949494949495, "grad_norm": 2.5484409209581504, "kl": 0.0146087646484375, "learning_rate": 2e-07, "loss": -0.023239874839782716, "memory(GiB)": 113.5, "reward": 0.20000000894069672, "reward_std": 0.31517534554004667, "rewards/MultiModalAccuracyORM/mean": 0.20000000894069672, "rewards/MultiModalAccuracyORM/std": 0.31517534554004667, "step": 1855, "train_speed(iter/s)": 0.031855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.3, "completions/mean_length": 354.6916748046875, "completions/min_length": 208.8, "epoch": 0.7515151515151515, "grad_norm": 2.0846151526365655, "kl": 0.0114715576171875, "learning_rate": 2e-07, "loss": 0.0073637284338474275, "memory(GiB)": 113.5, "reward": 0.18333333879709243, "reward_std": 0.2907939374446869, "rewards/MultiModalAccuracyORM/mean": 0.18333333879709243, "rewards/MultiModalAccuracyORM/std": 0.2907939374446869, "step": 1860, "train_speed(iter/s)": 0.031861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.8, "completions/mean_length": 304.43334045410154, "completions/min_length": 155.7, "epoch": 0.7535353535353535, "grad_norm": 2.0624318263809047, "kl": 0.01436767578125, "learning_rate": 2e-07, "loss": 0.040461289882659915, "memory(GiB)": 113.5, "reward": 0.2250000059604645, "reward_std": 0.28959646821022034, "rewards/MultiModalAccuracyORM/mean": 0.2250000059604645, "rewards/MultiModalAccuracyORM/std": 0.28959646821022034, "step": 1865, "train_speed(iter/s)": 0.031882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/mean_length": 348.55001678466795, "completions/min_length": 185.6, "epoch": 0.7555555555555555, "grad_norm": 0.07620984486729401, "kl": 0.02213134765625, "learning_rate": 2e-07, "loss": 0.014231646060943603, "memory(GiB)": 113.5, "reward": 0.1916666716337204, "reward_std": 0.23860623836517333, "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, "rewards/MultiModalAccuracyORM/std": 0.23860623836517333, "step": 1870, "train_speed(iter/s)": 0.031881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 763.2, "completions/mean_length": 389.03334045410156, "completions/min_length": 191.9, "epoch": 0.7575757575757576, "grad_norm": 2.525300571346317, "kl": 0.02110443115234375, "learning_rate": 2e-07, "loss": 0.0036004871129989625, "memory(GiB)": 113.5, "reward": 0.2916666716337204, "reward_std": 0.41791602075099943, "rewards/MultiModalAccuracyORM/mean": 0.2916666716337204, "rewards/MultiModalAccuracyORM/std": 0.41791602075099943, "step": 1875, "train_speed(iter/s)": 0.03188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.2, "completions/mean_length": 309.6250030517578, "completions/min_length": 170.7, "epoch": 0.7595959595959596, "grad_norm": 1.8476207975789374, "kl": 0.0132843017578125, "learning_rate": 2e-07, "loss": 0.01698073446750641, "memory(GiB)": 113.5, "reward": 0.1416666679084301, "reward_std": 0.24939410090446473, "rewards/MultiModalAccuracyORM/mean": 0.1416666679084301, "rewards/MultiModalAccuracyORM/std": 0.24939410090446473, "step": 1880, "train_speed(iter/s)": 0.031898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 651.4, "completions/mean_length": 324.4583480834961, "completions/min_length": 173.3, "epoch": 0.7616161616161616, "grad_norm": 2.8918258139669333, "kl": 0.0152374267578125, "learning_rate": 2e-07, "loss": -0.01050989031791687, "memory(GiB)": 113.5, "reward": 0.3916666731238365, "reward_std": 0.3340185970067978, "rewards/MultiModalAccuracyORM/mean": 0.3916666731238365, "rewards/MultiModalAccuracyORM/std": 0.3340185970067978, "step": 1885, "train_speed(iter/s)": 0.031901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/mean_length": 448.85834350585935, "completions/min_length": 173.1, "epoch": 0.7636363636363637, "grad_norm": 1.4150104336871425, "kl": 0.0143951416015625, "learning_rate": 2e-07, "loss": 0.013275668025016785, "memory(GiB)": 113.5, "reward": 0.2500000104308128, "reward_std": 0.33704383969306945, "rewards/MultiModalAccuracyORM/mean": 0.2500000104308128, "rewards/MultiModalAccuracyORM/std": 0.33704383969306945, "step": 1890, "train_speed(iter/s)": 0.031898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.5, "completions/mean_length": 351.9833450317383, "completions/min_length": 186.2, "epoch": 0.7656565656565657, "grad_norm": 1.8175161534038837, "kl": 0.014886474609375, "learning_rate": 2e-07, "loss": -0.021983048319816588, "memory(GiB)": 113.5, "reward": 0.2750000089406967, "reward_std": 0.30795769989490507, "rewards/MultiModalAccuracyORM/mean": 0.2750000089406967, "rewards/MultiModalAccuracyORM/std": 0.30795769989490507, "step": 1895, "train_speed(iter/s)": 0.031911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.1, "completions/mean_length": 327.0833404541016, "completions/min_length": 144.5, "epoch": 0.7676767676767676, "grad_norm": 2.5158628276606025, "kl": 0.0138824462890625, "learning_rate": 2e-07, "loss": 0.03910906314849853, "memory(GiB)": 113.5, "reward": 0.30833333656191825, "reward_std": 0.3422983974218369, "rewards/MultiModalAccuracyORM/mean": 0.30833333656191825, "rewards/MultiModalAccuracyORM/std": 0.3422983974218369, "step": 1900, "train_speed(iter/s)": 0.031926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.3, "completions/mean_length": 446.5250076293945, "completions/min_length": 241.6, "epoch": 0.7696969696969697, "grad_norm": 2.2012277452389415, "kl": 0.026849365234375, "learning_rate": 2e-07, "loss": 0.0031028717756271364, "memory(GiB)": 113.5, "reward": 0.1916666716337204, "reward_std": 0.2526139706373215, "rewards/MultiModalAccuracyORM/mean": 0.1916666716337204, "rewards/MultiModalAccuracyORM/std": 0.2526139706373215, "step": 1905, "train_speed(iter/s)": 0.031932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.5, "completions/mean_length": 332.84167633056643, "completions/min_length": 201.7, "epoch": 0.7717171717171717, "grad_norm": 1.9126113362129455, "kl": 0.0396942138671875, "learning_rate": 2e-07, "loss": -0.03872146010398865, "memory(GiB)": 113.5, "reward": 0.28333334252238274, "reward_std": 0.22631654143333435, "rewards/MultiModalAccuracyORM/mean": 0.28333334252238274, "rewards/MultiModalAccuracyORM/std": 0.22631654143333435, "step": 1910, "train_speed(iter/s)": 0.031948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.2, "completions/mean_length": 324.3500061035156, "completions/min_length": 168.6, "epoch": 0.7737373737373737, "grad_norm": 3.0923030780646883, "kl": 0.0193115234375, "learning_rate": 2e-07, "loss": -0.00021869316697120667, "memory(GiB)": 113.5, "reward": 0.21666667312383653, "reward_std": 0.3495877593755722, "rewards/MultiModalAccuracyORM/mean": 0.21666667312383653, "rewards/MultiModalAccuracyORM/std": 0.3495877593755722, "step": 1915, "train_speed(iter/s)": 0.031953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 851.2, "completions/mean_length": 429.6666732788086, "completions/min_length": 246.7, "epoch": 0.7757575757575758, "grad_norm": 1.4230705183827115, "kl": 0.0119659423828125, "learning_rate": 2e-07, "loss": -0.007732442766427994, "memory(GiB)": 113.5, "reward": 0.28333334177732467, "reward_std": 0.3922538310289383, "rewards/MultiModalAccuracyORM/mean": 0.28333334177732467, "rewards/MultiModalAccuracyORM/std": 0.3922538310289383, "step": 1920, "train_speed(iter/s)": 0.03195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.1, "completions/mean_length": 327.52501220703124, "completions/min_length": 145.3, "epoch": 0.7777777777777778, "grad_norm": 2.492778960496138, "kl": 0.0209716796875, "learning_rate": 2e-07, "loss": 0.058314287662506105, "memory(GiB)": 113.5, "reward": 0.3416666746139526, "reward_std": 0.3370794355869293, "rewards/MultiModalAccuracyORM/mean": 0.3416666746139526, "rewards/MultiModalAccuracyORM/std": 0.3370794355869293, "step": 1925, "train_speed(iter/s)": 0.031957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.2, "completions/mean_length": 314.4666748046875, "completions/min_length": 159.8, "epoch": 0.7797979797979798, "grad_norm": 1.3216256644694324, "kl": 0.019390869140625, "learning_rate": 2e-07, "loss": 0.003662779927253723, "memory(GiB)": 113.5, "reward": 0.40000000447034834, "reward_std": 0.19031869769096374, "rewards/MultiModalAccuracyORM/mean": 0.40000000447034834, "rewards/MultiModalAccuracyORM/std": 0.19031869769096374, "step": 1930, "train_speed(iter/s)": 0.031969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 614.5, "completions/mean_length": 340.2666778564453, "completions/min_length": 194.2, "epoch": 0.7818181818181819, "grad_norm": 0.16139191599066427, "kl": 0.0214599609375, "learning_rate": 2e-07, "loss": -0.047375884652137754, "memory(GiB)": 113.5, "reward": 0.5166666835546494, "reward_std": 0.33453335165977477, "rewards/MultiModalAccuracyORM/mean": 0.5166666835546494, "rewards/MultiModalAccuracyORM/std": 0.33453335165977477, "step": 1935, "train_speed(iter/s)": 0.031969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/mean_length": 385.9500137329102, "completions/min_length": 210.7, "epoch": 0.7838383838383839, "grad_norm": 1.6715004236392428, "kl": 0.0131805419921875, "learning_rate": 2e-07, "loss": -0.010814064741134643, "memory(GiB)": 113.5, "reward": 0.2250000037252903, "reward_std": 0.2325587034225464, "rewards/MultiModalAccuracyORM/mean": 0.2250000037252903, "rewards/MultiModalAccuracyORM/std": 0.2325587034225464, "step": 1940, "train_speed(iter/s)": 0.031974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 803.8, "completions/mean_length": 441.1000091552734, "completions/min_length": 249.1, "epoch": 0.7858585858585858, "grad_norm": 1.1825903834954647, "kl": 0.0154388427734375, "learning_rate": 2e-07, "loss": 0.0033442020416259766, "memory(GiB)": 113.5, "reward": 0.20833333656191827, "reward_std": 0.2938903748989105, "rewards/MultiModalAccuracyORM/mean": 0.20833333656191827, "rewards/MultiModalAccuracyORM/std": 0.2938903748989105, "step": 1945, "train_speed(iter/s)": 0.031969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.7, "completions/mean_length": 325.9666748046875, "completions/min_length": 175.2, "epoch": 0.7878787878787878, "grad_norm": 1.0389835461828303, "kl": 0.029302978515625, "learning_rate": 2e-07, "loss": 0.0020487613976001738, "memory(GiB)": 113.5, "reward": 0.31666667461395265, "reward_std": 0.2074468642473221, "rewards/MultiModalAccuracyORM/mean": 0.31666667461395265, "rewards/MultiModalAccuracyORM/std": 0.2074468642473221, "step": 1950, "train_speed(iter/s)": 0.031978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 779.1, "completions/mean_length": 420.0750106811523, "completions/min_length": 233.8, "epoch": 0.7898989898989899, "grad_norm": 2.259966046846081, "kl": 0.0141693115234375, "learning_rate": 2e-07, "loss": 0.0017102479934692383, "memory(GiB)": 113.5, "reward": 0.09166666939854622, "reward_std": 0.18332210481166838, "rewards/MultiModalAccuracyORM/mean": 0.09166666939854622, "rewards/MultiModalAccuracyORM/std": 0.18332210481166838, "step": 1955, "train_speed(iter/s)": 0.031981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.8, "completions/mean_length": 396.0333450317383, "completions/min_length": 206.8, "epoch": 0.7919191919191919, "grad_norm": 2.0736200850407713, "kl": 0.0152496337890625, "learning_rate": 2e-07, "loss": -0.005099079012870789, "memory(GiB)": 113.5, "reward": 0.5583333425223828, "reward_std": 0.28784283697605134, "rewards/MultiModalAccuracyORM/mean": 0.5583333425223828, "rewards/MultiModalAccuracyORM/std": 0.28784283697605134, "step": 1960, "train_speed(iter/s)": 0.031988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 954.2, "completions/mean_length": 473.5583465576172, "completions/min_length": 247.3, "epoch": 0.793939393939394, "grad_norm": 2.157212917514597, "kl": 0.0150390625, "learning_rate": 2e-07, "loss": 0.024318861961364745, "memory(GiB)": 113.5, "reward": 0.21666667535901069, "reward_std": 0.36190145611763, "rewards/MultiModalAccuracyORM/mean": 0.21666667535901069, "rewards/MultiModalAccuracyORM/std": 0.36190145611763, "step": 1965, "train_speed(iter/s)": 0.031976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.4, "completions/mean_length": 456.8000152587891, "completions/min_length": 269.1, "epoch": 0.795959595959596, "grad_norm": 2.346804141928421, "kl": 0.0154296875, "learning_rate": 2e-07, "loss": 0.011195459961891174, "memory(GiB)": 113.5, "reward": 0.1500000037252903, "reward_std": 0.25897532403469087, "rewards/MultiModalAccuracyORM/mean": 0.1500000037252903, "rewards/MultiModalAccuracyORM/std": 0.25897532403469087, "step": 1970, "train_speed(iter/s)": 0.031975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.7, "completions/mean_length": 426.95000610351565, "completions/min_length": 288.6, "epoch": 0.797979797979798, "grad_norm": 1.937444109706918, "kl": 0.012738037109375, "learning_rate": 2e-07, "loss": 0.050849252939224245, "memory(GiB)": 113.5, "reward": 0.33333334401249887, "reward_std": 0.35569489002227783, "rewards/MultiModalAccuracyORM/mean": 0.33333334401249887, "rewards/MultiModalAccuracyORM/std": 0.35569489002227783, "step": 1975, "train_speed(iter/s)": 0.031976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.9, "completions/mean_length": 308.3166732788086, "completions/min_length": 159.5, "epoch": 0.8, "grad_norm": 1.2310292555448101, "kl": 0.01746826171875, "learning_rate": 2e-07, "loss": 0.021820831298828124, "memory(GiB)": 113.5, "reward": 0.25000000149011614, "reward_std": 0.34010172784328463, "rewards/MultiModalAccuracyORM/mean": 0.25000000149011614, "rewards/MultiModalAccuracyORM/std": 0.34010172784328463, "step": 1980, "train_speed(iter/s)": 0.031988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.3, "completions/mean_length": 397.74167861938474, "completions/min_length": 229.1, "epoch": 0.802020202020202, "grad_norm": 1.2006546705713226, "kl": 0.012060546875, "learning_rate": 2e-07, "loss": -0.00946882963180542, "memory(GiB)": 113.5, "reward": 0.3666666768491268, "reward_std": 0.21775851845741273, "rewards/MultiModalAccuracyORM/mean": 0.3666666768491268, "rewards/MultiModalAccuracyORM/std": 0.21775851845741273, "step": 1985, "train_speed(iter/s)": 0.031999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/mean_length": 455.6916870117187, "completions/min_length": 271.2, "epoch": 0.804040404040404, "grad_norm": 1.7247663724146078, "kl": 0.0120758056640625, "learning_rate": 2e-07, "loss": -0.013834655284881592, "memory(GiB)": 113.5, "reward": 0.2916666746139526, "reward_std": 0.34933353662490846, "rewards/MultiModalAccuracyORM/mean": 0.2916666746139526, "rewards/MultiModalAccuracyORM/std": 0.34933353662490846, "step": 1990, "train_speed(iter/s)": 0.032003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.3, "completions/mean_length": 388.1000122070312, "completions/min_length": 220.5, "epoch": 0.806060606060606, "grad_norm": 1.2751536443809328, "kl": 0.0211517333984375, "learning_rate": 2e-07, "loss": 0.026651501655578613, "memory(GiB)": 113.5, "reward": 0.2750000096857548, "reward_std": 0.29452561140060424, "rewards/MultiModalAccuracyORM/mean": 0.2750000096857548, "rewards/MultiModalAccuracyORM/std": 0.29452561140060424, "step": 1995, "train_speed(iter/s)": 0.032003 }, { "epoch": 0.8080808080808081, "grad_norm": 2.53993588975996, "learning_rate": 2e-07, "loss": 0.008918963372707367, "memory(GiB)": 113.5, "step": 2000, "train_speed(iter/s)": 0.032019 }, { "epoch": 0.8080808080808081, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 640.5, "eval_completions/mean_length": 393.71500930786135, "eval_completions/min_length": 218.08, "eval_kl": 0.01480712890625, "eval_loss": 0.023003682494163513, "eval_reward": 0.30333334133028983, "eval_reward_std": 0.2836029249429703, "eval_rewards/MultiModalAccuracyORM/mean": 0.30333334133028983, "eval_rewards/MultiModalAccuracyORM/std": 0.2836029249429703, "eval_runtime": 625.7559, "eval_samples_per_second": 0.08, "eval_steps_per_second": 0.008, "step": 2000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.1, "completions/mean_length": 439.12501831054686, "completions/min_length": 245.5, "epoch": 0.8101010101010101, "grad_norm": 1.574060720208308, "kl": 0.01459503173828125, "learning_rate": 2e-07, "loss": -0.005982875823974609, "memory(GiB)": 113.5, "reward": 0.33333334103226664, "reward_std": 0.3096754729747772, "rewards/MultiModalAccuracyORM/mean": 0.33333334103226664, "rewards/MultiModalAccuracyORM/std": 0.3096754729747772, "step": 2005, "train_speed(iter/s)": 0.031605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.3, "completions/mean_length": 312.4916702270508, "completions/min_length": 184.5, "epoch": 0.8121212121212121, "grad_norm": 1.8875313816028536, "kl": 0.01746826171875, "learning_rate": 2e-07, "loss": 0.04548422992229462, "memory(GiB)": 113.5, "reward": 0.4666666842997074, "reward_std": 0.4252053827047348, "rewards/MultiModalAccuracyORM/mean": 0.4666666842997074, "rewards/MultiModalAccuracyORM/std": 0.4252053827047348, "step": 2010, "train_speed(iter/s)": 0.031626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.2, "completions/mean_length": 329.9166778564453, "completions/min_length": 202.0, "epoch": 0.8141414141414142, "grad_norm": 1.858641750452265, "kl": 0.0157196044921875, "learning_rate": 2e-07, "loss": 0.023762321472167967, "memory(GiB)": 113.5, "reward": 0.40000001043081285, "reward_std": 0.3144780844449997, "rewards/MultiModalAccuracyORM/mean": 0.40000001043081285, "rewards/MultiModalAccuracyORM/std": 0.3144780844449997, "step": 2015, "train_speed(iter/s)": 0.031642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/mean_length": 422.71667633056643, "completions/min_length": 200.3, "epoch": 0.8161616161616162, "grad_norm": 3.0722357868631334, "kl": 0.0186279296875, "learning_rate": 2e-07, "loss": -0.03257267475128174, "memory(GiB)": 113.5, "reward": 0.32500000968575476, "reward_std": 0.4204265087842941, "rewards/MultiModalAccuracyORM/mean": 0.32500000968575476, "rewards/MultiModalAccuracyORM/std": 0.4204265087842941, "step": 2020, "train_speed(iter/s)": 0.031653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.7, "completions/mean_length": 355.5083511352539, "completions/min_length": 214.7, "epoch": 0.8181818181818182, "grad_norm": 2.729236730716231, "kl": 0.022601318359375, "learning_rate": 2e-07, "loss": -0.003387349843978882, "memory(GiB)": 113.5, "reward": 0.4250000074505806, "reward_std": 0.45008404850959777, "rewards/MultiModalAccuracyORM/mean": 0.4250000074505806, "rewards/MultiModalAccuracyORM/std": 0.45008404850959777, "step": 2025, "train_speed(iter/s)": 0.031658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.6, "completions/mean_length": 421.4000183105469, "completions/min_length": 253.0, "epoch": 0.8202020202020202, "grad_norm": 1.744543874583184, "kl": 0.0112030029296875, "learning_rate": 2e-07, "loss": -0.013242574036121368, "memory(GiB)": 113.5, "reward": 0.1083333358168602, "reward_std": 0.29628167152404783, "rewards/MultiModalAccuracyORM/mean": 0.1083333358168602, "rewards/MultiModalAccuracyORM/std": 0.29628167152404783, "step": 2030, "train_speed(iter/s)": 0.031663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.7, "completions/mean_length": 348.20834197998045, "completions/min_length": 219.4, "epoch": 0.8222222222222222, "grad_norm": 1.3474739820299675, "kl": 0.018927001953125, "learning_rate": 2e-07, "loss": 0.04633485376834869, "memory(GiB)": 113.5, "reward": 0.37500000596046446, "reward_std": 0.27622397541999816, "rewards/MultiModalAccuracyORM/mean": 0.37500000596046446, "rewards/MultiModalAccuracyORM/std": 0.27622397541999816, "step": 2035, "train_speed(iter/s)": 0.031672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.4, "completions/mean_length": 350.50000915527346, "completions/min_length": 204.4, "epoch": 0.8242424242424242, "grad_norm": 1.5018646106657063, "kl": 0.019403076171875, "learning_rate": 2e-07, "loss": 0.030666446685791014, "memory(GiB)": 113.5, "reward": 0.49166667014360427, "reward_std": 0.32050161957740786, "rewards/MultiModalAccuracyORM/mean": 0.49166667014360427, "rewards/MultiModalAccuracyORM/std": 0.32050161957740786, "step": 2040, "train_speed(iter/s)": 0.031681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.7, "completions/mean_length": 343.3500045776367, "completions/min_length": 214.3, "epoch": 0.8262626262626263, "grad_norm": 1.212062767454231, "kl": 0.0136077880859375, "learning_rate": 2e-07, "loss": 0.00010424554347991944, "memory(GiB)": 113.5, "reward": 0.2833333402872086, "reward_std": 0.3485885590314865, "rewards/MultiModalAccuracyORM/mean": 0.2833333402872086, "rewards/MultiModalAccuracyORM/std": 0.3485885590314865, "step": 2045, "train_speed(iter/s)": 0.031692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/mean_length": 329.8250106811523, "completions/min_length": 163.3, "epoch": 0.8282828282828283, "grad_norm": 1.4699358421537125, "kl": 0.015545654296875, "learning_rate": 2e-07, "loss": 0.02045893669128418, "memory(GiB)": 113.5, "reward": 0.23333333656191826, "reward_std": 0.21999078392982482, "rewards/MultiModalAccuracyORM/mean": 0.23333333656191826, "rewards/MultiModalAccuracyORM/std": 0.21999078392982482, "step": 2050, "train_speed(iter/s)": 0.031697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.3, "completions/mean_length": 291.33334197998045, "completions/min_length": 162.8, "epoch": 0.8303030303030303, "grad_norm": 1.4524213577819918, "kl": 0.0140533447265625, "learning_rate": 2e-07, "loss": 0.008110976219177246, "memory(GiB)": 113.5, "reward": 0.3166666738688946, "reward_std": 0.20369119048118592, "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, "rewards/MultiModalAccuracyORM/std": 0.20369119048118592, "step": 2055, "train_speed(iter/s)": 0.031712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.6, "completions/mean_length": 289.6416732788086, "completions/min_length": 159.3, "epoch": 0.8323232323232324, "grad_norm": 4.225291158056462, "kl": 0.0178955078125, "learning_rate": 2e-07, "loss": 0.025725898146629334, "memory(GiB)": 113.5, "reward": 0.20000000447034835, "reward_std": 0.29414459466934206, "rewards/MultiModalAccuracyORM/mean": 0.20000000447034835, "rewards/MultiModalAccuracyORM/std": 0.29414459466934206, "step": 2060, "train_speed(iter/s)": 0.031722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.6, "completions/mean_length": 311.5416793823242, "completions/min_length": 190.1, "epoch": 0.8343434343434344, "grad_norm": 2.6613790818964134, "kl": 0.021978759765625, "learning_rate": 2e-07, "loss": 0.006576963514089584, "memory(GiB)": 113.5, "reward": 0.2916666753590107, "reward_std": 0.40155683159828187, "rewards/MultiModalAccuracyORM/mean": 0.2916666753590107, "rewards/MultiModalAccuracyORM/std": 0.40155683159828187, "step": 2065, "train_speed(iter/s)": 0.03173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.6, "completions/mean_length": 315.6750038146973, "completions/min_length": 170.0, "epoch": 0.8363636363636363, "grad_norm": 1.5051205676406512, "kl": 0.0233642578125, "learning_rate": 2e-07, "loss": 0.09363476037979127, "memory(GiB)": 113.5, "reward": 0.508333345502615, "reward_std": 0.2822715103626251, "rewards/MultiModalAccuracyORM/mean": 0.508333345502615, "rewards/MultiModalAccuracyORM/std": 0.2822715103626251, "step": 2070, "train_speed(iter/s)": 0.03174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.5, "completions/mean_length": 375.75000762939453, "completions/min_length": 224.4, "epoch": 0.8383838383838383, "grad_norm": 1.8057057513107744, "kl": 0.018963623046875, "learning_rate": 2e-07, "loss": -0.023636098206043243, "memory(GiB)": 113.5, "reward": 0.4000000111758709, "reward_std": 0.33306954205036166, "rewards/MultiModalAccuracyORM/mean": 0.4000000111758709, "rewards/MultiModalAccuracyORM/std": 0.33306954205036166, "step": 2075, "train_speed(iter/s)": 0.031751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.7, "completions/mean_length": 414.0416793823242, "completions/min_length": 194.4, "epoch": 0.8404040404040404, "grad_norm": 1.7580953588231485, "kl": 0.015985107421875, "learning_rate": 2e-07, "loss": 0.004860112071037292, "memory(GiB)": 113.5, "reward": 0.33333334028720857, "reward_std": 0.30333785712718964, "rewards/MultiModalAccuracyORM/mean": 0.33333334028720857, "rewards/MultiModalAccuracyORM/std": 0.30333785712718964, "step": 2080, "train_speed(iter/s)": 0.031761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.9, "completions/mean_length": 372.9250122070313, "completions/min_length": 192.9, "epoch": 0.8424242424242424, "grad_norm": 1.6888227745633726, "kl": 0.018841552734375, "learning_rate": 2e-07, "loss": 0.0038746654987335204, "memory(GiB)": 113.5, "reward": 0.40000001564621923, "reward_std": 0.3948384612798691, "rewards/MultiModalAccuracyORM/mean": 0.40000001564621923, "rewards/MultiModalAccuracyORM/std": 0.3948384612798691, "step": 2085, "train_speed(iter/s)": 0.031778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.5, "completions/mean_length": 359.8833480834961, "completions/min_length": 211.2, "epoch": 0.8444444444444444, "grad_norm": 2.0473894442291605, "kl": 0.0135040283203125, "learning_rate": 2e-07, "loss": -0.005132901668548584, "memory(GiB)": 113.5, "reward": 0.4833333469927311, "reward_std": 0.38904850780963895, "rewards/MultiModalAccuracyORM/mean": 0.4833333469927311, "rewards/MultiModalAccuracyORM/std": 0.38904850780963895, "step": 2090, "train_speed(iter/s)": 0.031799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.2, "completions/mean_length": 453.96668243408203, "completions/min_length": 251.3, "epoch": 0.8464646464646465, "grad_norm": 1.9528281428716412, "kl": 0.015765380859375, "learning_rate": 2e-07, "loss": -0.00459083616733551, "memory(GiB)": 113.5, "reward": 0.3416666783392429, "reward_std": 0.4211809396743774, "rewards/MultiModalAccuracyORM/mean": 0.3416666783392429, "rewards/MultiModalAccuracyORM/std": 0.4211809396743774, "step": 2095, "train_speed(iter/s)": 0.031796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.1, "completions/mean_length": 360.11667785644534, "completions/min_length": 181.9, "epoch": 0.8484848484848485, "grad_norm": 1.6610621083165809, "kl": 0.0178131103515625, "learning_rate": 2e-07, "loss": 0.0021423667669296263, "memory(GiB)": 113.5, "reward": 0.31666667237877844, "reward_std": 0.33000870048999786, "rewards/MultiModalAccuracyORM/mean": 0.31666667237877844, "rewards/MultiModalAccuracyORM/std": 0.33000870048999786, "step": 2100, "train_speed(iter/s)": 0.031806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.3, "completions/mean_length": 264.67501068115234, "completions/min_length": 136.9, "epoch": 0.8505050505050505, "grad_norm": 2.0696467764003192, "kl": 0.163067626953125, "learning_rate": 2e-07, "loss": 0.0025389432907104493, "memory(GiB)": 113.5, "reward": 0.5000000096857548, "reward_std": 0.22625694572925567, "rewards/MultiModalAccuracyORM/mean": 0.5000000096857548, "rewards/MultiModalAccuracyORM/std": 0.22625694572925567, "step": 2105, "train_speed(iter/s)": 0.03182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.7, "completions/mean_length": 341.6750091552734, "completions/min_length": 188.4, "epoch": 0.8525252525252526, "grad_norm": 1.7692403149903426, "kl": 0.0196319580078125, "learning_rate": 2e-07, "loss": 0.016690313816070557, "memory(GiB)": 113.5, "reward": 0.3583333343267441, "reward_std": 0.21292004883289337, "rewards/MultiModalAccuracyORM/mean": 0.3583333343267441, "rewards/MultiModalAccuracyORM/std": 0.21292004883289337, "step": 2110, "train_speed(iter/s)": 0.031829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.9, "completions/mean_length": 410.6416839599609, "completions/min_length": 215.5, "epoch": 0.8545454545454545, "grad_norm": 1.0155490841614827, "kl": 0.0239013671875, "learning_rate": 2e-07, "loss": 0.06116962432861328, "memory(GiB)": 113.5, "reward": 0.3000000067055225, "reward_std": 0.3330695390701294, "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, "step": 2115, "train_speed(iter/s)": 0.031834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 756.7, "completions/mean_length": 427.03334350585936, "completions/min_length": 240.2, "epoch": 0.8565656565656565, "grad_norm": 1.5010329222185153, "kl": 0.0179931640625, "learning_rate": 2e-07, "loss": 0.008207672834396362, "memory(GiB)": 113.5, "reward": 0.3000000067055225, "reward_std": 0.26822818219661715, "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, "rewards/MultiModalAccuracyORM/std": 0.26822818219661715, "step": 2120, "train_speed(iter/s)": 0.031834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/mean_length": 278.35834350585935, "completions/min_length": 133.5, "epoch": 0.8585858585858586, "grad_norm": 2.4751168878714296, "kl": 0.0186279296875, "learning_rate": 2e-07, "loss": 0.002880534529685974, "memory(GiB)": 113.5, "reward": 0.44166667833924295, "reward_std": 0.26897315979003905, "rewards/MultiModalAccuracyORM/mean": 0.44166667833924295, "rewards/MultiModalAccuracyORM/std": 0.26897315979003905, "step": 2125, "train_speed(iter/s)": 0.03185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 778.6, "completions/mean_length": 397.366682434082, "completions/min_length": 238.8, "epoch": 0.8606060606060606, "grad_norm": 2.5358484406016406, "kl": 0.024908447265625, "learning_rate": 2e-07, "loss": -0.008894717693328858, "memory(GiB)": 113.5, "reward": 0.4166666753590107, "reward_std": 0.39010730385780334, "rewards/MultiModalAccuracyORM/mean": 0.4166666753590107, "rewards/MultiModalAccuracyORM/std": 0.39010730385780334, "step": 2130, "train_speed(iter/s)": 0.031849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.6, "completions/mean_length": 347.92500762939454, "completions/min_length": 210.0, "epoch": 0.8626262626262626, "grad_norm": 1.4480874521635712, "kl": 0.013836669921875, "learning_rate": 2e-07, "loss": -0.02624996304512024, "memory(GiB)": 113.5, "reward": 0.24166666865348815, "reward_std": 0.2815766751766205, "rewards/MultiModalAccuracyORM/mean": 0.24166666865348815, "rewards/MultiModalAccuracyORM/std": 0.2815766751766205, "step": 2135, "train_speed(iter/s)": 0.031857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 792.0, "completions/mean_length": 399.3750129699707, "completions/min_length": 233.7, "epoch": 0.8646464646464647, "grad_norm": 2.3120304434709595, "kl": 0.01986083984375, "learning_rate": 2e-07, "loss": -0.004719728231430053, "memory(GiB)": 113.5, "reward": 0.22500000149011612, "reward_std": 0.22384164929389955, "rewards/MultiModalAccuracyORM/mean": 0.22500000149011612, "rewards/MultiModalAccuracyORM/std": 0.22384164929389955, "step": 2140, "train_speed(iter/s)": 0.03185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.2, "completions/mean_length": 441.508349609375, "completions/min_length": 236.5, "epoch": 0.8666666666666667, "grad_norm": 2.17037271282662, "kl": 0.01793212890625, "learning_rate": 2e-07, "loss": -0.012784427404403687, "memory(GiB)": 113.5, "reward": 0.23333333432674408, "reward_std": 0.2581467509269714, "rewards/MultiModalAccuracyORM/mean": 0.23333333432674408, "rewards/MultiModalAccuracyORM/std": 0.2581467509269714, "step": 2145, "train_speed(iter/s)": 0.03185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.1, "completions/mean_length": 304.8583419799805, "completions/min_length": 197.2, "epoch": 0.8686868686868687, "grad_norm": 2.4684483286798313, "kl": 0.018658447265625, "learning_rate": 2e-07, "loss": -0.013285607099533081, "memory(GiB)": 113.5, "reward": 0.4250000067055225, "reward_std": 0.3696640759706497, "rewards/MultiModalAccuracyORM/mean": 0.4250000067055225, "rewards/MultiModalAccuracyORM/std": 0.3696640759706497, "step": 2150, "train_speed(iter/s)": 0.031865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.6, "completions/mean_length": 357.44168243408205, "completions/min_length": 206.1, "epoch": 0.8707070707070707, "grad_norm": 2.4866065792724794, "kl": 0.01856689453125, "learning_rate": 2e-07, "loss": -0.014015734195709229, "memory(GiB)": 113.5, "reward": 0.23333334177732468, "reward_std": 0.24436976611614228, "rewards/MultiModalAccuracyORM/mean": 0.23333334177732468, "rewards/MultiModalAccuracyORM/std": 0.24436976611614228, "step": 2155, "train_speed(iter/s)": 0.03187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.2, "completions/mean_length": 316.7916778564453, "completions/min_length": 156.8, "epoch": 0.8727272727272727, "grad_norm": 0.10342060356020474, "kl": 0.019561767578125, "learning_rate": 2e-07, "loss": 0.015072919428348541, "memory(GiB)": 113.5, "reward": 0.5250000067055225, "reward_std": 0.23303491175174712, "rewards/MultiModalAccuracyORM/mean": 0.5250000067055225, "rewards/MultiModalAccuracyORM/std": 0.23303491175174712, "step": 2160, "train_speed(iter/s)": 0.031885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.5, "completions/mean_length": 359.40834350585936, "completions/min_length": 213.1, "epoch": 0.8747474747474747, "grad_norm": 2.001531798018373, "kl": 0.02108154296875, "learning_rate": 2e-07, "loss": 0.03580483496189117, "memory(GiB)": 113.5, "reward": 0.2916666708886623, "reward_std": 0.37593023777008056, "rewards/MultiModalAccuracyORM/mean": 0.2916666708886623, "rewards/MultiModalAccuracyORM/std": 0.37593023777008056, "step": 2165, "train_speed(iter/s)": 0.03189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.1, "completions/mean_length": 376.5666778564453, "completions/min_length": 215.1, "epoch": 0.8767676767676768, "grad_norm": 1.7513379048306494, "kl": 0.01739501953125, "learning_rate": 2e-07, "loss": -0.001603315770626068, "memory(GiB)": 113.5, "reward": 0.35833333656191824, "reward_std": 0.27927026748657224, "rewards/MultiModalAccuracyORM/mean": 0.35833333656191824, "rewards/MultiModalAccuracyORM/std": 0.27927026748657224, "step": 2170, "train_speed(iter/s)": 0.031901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.4, "completions/mean_length": 326.4833419799805, "completions/min_length": 178.7, "epoch": 0.8787878787878788, "grad_norm": 1.8257626566364757, "kl": 0.01832275390625, "learning_rate": 2e-07, "loss": -0.0064360305666923525, "memory(GiB)": 113.5, "reward": 0.4000000096857548, "reward_std": 0.2528681933879852, "rewards/MultiModalAccuracyORM/mean": 0.4000000096857548, "rewards/MultiModalAccuracyORM/std": 0.2528681933879852, "step": 2175, "train_speed(iter/s)": 0.031922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.1, "completions/mean_length": 443.8500183105469, "completions/min_length": 252.4, "epoch": 0.8808080808080808, "grad_norm": 2.0080312898238777, "kl": 0.0168212890625, "learning_rate": 2e-07, "loss": 0.003071814775466919, "memory(GiB)": 113.5, "reward": 0.25000001341104505, "reward_std": 0.27749558687210085, "rewards/MultiModalAccuracyORM/mean": 0.25000001341104505, "rewards/MultiModalAccuracyORM/std": 0.27749558687210085, "step": 2180, "train_speed(iter/s)": 0.031939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.9, "completions/mean_length": 286.4000068664551, "completions/min_length": 137.1, "epoch": 0.8828282828282829, "grad_norm": 2.6881230452510176, "kl": 0.0223968505859375, "learning_rate": 2e-07, "loss": 0.0006526708602905273, "memory(GiB)": 113.5, "reward": 0.5333333387970924, "reward_std": 0.20369119048118592, "rewards/MultiModalAccuracyORM/mean": 0.5333333387970924, "rewards/MultiModalAccuracyORM/std": 0.20369119048118592, "step": 2185, "train_speed(iter/s)": 0.031947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.5, "completions/mean_length": 402.3166793823242, "completions/min_length": 210.6, "epoch": 0.8848484848484849, "grad_norm": 2.1762586962994126, "kl": 0.02080078125, "learning_rate": 2e-07, "loss": 0.01941031664609909, "memory(GiB)": 113.5, "reward": 0.13333333805203437, "reward_std": 0.28399197161197665, "rewards/MultiModalAccuracyORM/mean": 0.13333333805203437, "rewards/MultiModalAccuracyORM/std": 0.28399197161197665, "step": 2190, "train_speed(iter/s)": 0.031947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.7, "completions/mean_length": 376.9000160217285, "completions/min_length": 233.4, "epoch": 0.8868686868686869, "grad_norm": 1.5099724310943463, "kl": 0.013751220703125, "learning_rate": 2e-07, "loss": 0.018771827220916748, "memory(GiB)": 113.5, "reward": 0.3083333395421505, "reward_std": 0.22406027615070342, "rewards/MultiModalAccuracyORM/mean": 0.3083333395421505, "rewards/MultiModalAccuracyORM/std": 0.22406027615070342, "step": 2195, "train_speed(iter/s)": 0.031954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 564.0, "completions/mean_length": 270.4416732788086, "completions/min_length": 142.9, "epoch": 0.8888888888888888, "grad_norm": 2.2743682671690997, "kl": 0.023577880859375, "learning_rate": 2e-07, "loss": 0.025069376826286315, "memory(GiB)": 113.5, "reward": 0.41666666939854624, "reward_std": 0.34936913251876833, "rewards/MultiModalAccuracyORM/mean": 0.41666666939854624, "rewards/MultiModalAccuracyORM/std": 0.34936913251876833, "step": 2200, "train_speed(iter/s)": 0.031957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.7, "completions/mean_length": 299.5250076293945, "completions/min_length": 173.3, "epoch": 0.8909090909090909, "grad_norm": 1.7147767163429606, "kl": 0.015533447265625, "learning_rate": 2e-07, "loss": -0.01650981158018112, "memory(GiB)": 113.5, "reward": 0.3666666761040688, "reward_std": 0.26142621636390684, "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, "rewards/MultiModalAccuracyORM/std": 0.26142621636390684, "step": 2205, "train_speed(iter/s)": 0.031967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 678.7, "completions/mean_length": 303.3583396911621, "completions/min_length": 148.9, "epoch": 0.8929292929292929, "grad_norm": 0.07977564640032515, "kl": 0.022882080078125, "learning_rate": 2e-07, "loss": -0.015148724615573882, "memory(GiB)": 113.5, "reward": 0.2750000052154064, "reward_std": 0.2333131343126297, "rewards/MultiModalAccuracyORM/mean": 0.2750000052154064, "rewards/MultiModalAccuracyORM/std": 0.2333131343126297, "step": 2210, "train_speed(iter/s)": 0.031965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.5, "completions/mean_length": 388.18333740234374, "completions/min_length": 198.5, "epoch": 0.8949494949494949, "grad_norm": 0.8099900753838608, "kl": 0.0284912109375, "learning_rate": 2e-07, "loss": 0.00753181129693985, "memory(GiB)": 113.5, "reward": 0.416666679084301, "reward_std": 0.34156554043292997, "rewards/MultiModalAccuracyORM/mean": 0.416666679084301, "rewards/MultiModalAccuracyORM/std": 0.34156554043292997, "step": 2215, "train_speed(iter/s)": 0.03197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.025, "completions/max_length": 779.3, "completions/mean_length": 450.6333526611328, "completions/min_length": 239.0, "epoch": 0.896969696969697, "grad_norm": 1.7310208765669708, "kl": 0.0198486328125, "learning_rate": 2e-07, "loss": -0.004081086814403534, "memory(GiB)": 113.5, "reward": 0.2500000074505806, "reward_std": 0.3800142765045166, "rewards/MultiModalAccuracyORM/mean": 0.2500000074505806, "rewards/MultiModalAccuracyORM/std": 0.3800142765045166, "step": 2220, "train_speed(iter/s)": 0.03197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 676.4, "completions/mean_length": 360.75001220703126, "completions/min_length": 217.2, "epoch": 0.898989898989899, "grad_norm": 2.1020973545612702, "kl": 0.01806640625, "learning_rate": 2e-07, "loss": 0.03712728023529053, "memory(GiB)": 113.5, "reward": 0.3000000134110451, "reward_std": 0.32673218548297883, "rewards/MultiModalAccuracyORM/mean": 0.3000000134110451, "rewards/MultiModalAccuracyORM/std": 0.32673218548297883, "step": 2225, "train_speed(iter/s)": 0.031975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 820.2, "completions/mean_length": 446.9583396911621, "completions/min_length": 246.3, "epoch": 0.901010101010101, "grad_norm": 1.2524422904219505, "kl": 0.016302490234375, "learning_rate": 2e-07, "loss": -0.02771589457988739, "memory(GiB)": 113.5, "reward": 0.32500000596046447, "reward_std": 0.31088480055332185, "rewards/MultiModalAccuracyORM/mean": 0.32500000596046447, "rewards/MultiModalAccuracyORM/std": 0.31088480055332185, "step": 2230, "train_speed(iter/s)": 0.031963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.1, "completions/mean_length": 314.425008392334, "completions/min_length": 211.4, "epoch": 0.9030303030303031, "grad_norm": 1.941568088508899, "kl": 0.0159149169921875, "learning_rate": 2e-07, "loss": 0.03777821063995361, "memory(GiB)": 113.5, "reward": 0.450000011920929, "reward_std": 0.391499400138855, "rewards/MultiModalAccuracyORM/mean": 0.450000011920929, "rewards/MultiModalAccuracyORM/std": 0.391499400138855, "step": 2235, "train_speed(iter/s)": 0.031975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.2, "completions/mean_length": 344.6333435058594, "completions/min_length": 198.5, "epoch": 0.9050505050505051, "grad_norm": 2.0763848655087673, "kl": 0.019061279296875, "learning_rate": 2e-07, "loss": -0.0011584073305130004, "memory(GiB)": 113.5, "reward": 0.40000000670552255, "reward_std": 0.34407602846622465, "rewards/MultiModalAccuracyORM/mean": 0.40000000670552255, "rewards/MultiModalAccuracyORM/std": 0.34407602846622465, "step": 2240, "train_speed(iter/s)": 0.031993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.4, "completions/mean_length": 319.63334197998046, "completions/min_length": 160.3, "epoch": 0.907070707070707, "grad_norm": 2.59722388457303, "kl": 0.022515869140625, "learning_rate": 2e-07, "loss": -0.017506715655326844, "memory(GiB)": 113.5, "reward": 0.27500001043081285, "reward_std": 0.3227818846702576, "rewards/MultiModalAccuracyORM/mean": 0.27500001043081285, "rewards/MultiModalAccuracyORM/std": 0.3227818846702576, "step": 2245, "train_speed(iter/s)": 0.031998 }, { "epoch": 0.9090909090909091, "grad_norm": 1.3781323461012882, "learning_rate": 2e-07, "loss": 0.01341366171836853, "memory(GiB)": 113.5, "step": 2250, "train_speed(iter/s)": 0.032003 }, { "epoch": 0.9090909090909091, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.0016666666666666666, "eval_completions/max_length": 642.72, "eval_completions/mean_length": 376.58501220703124, "eval_completions/min_length": 201.48, "eval_kl": 0.01755615234375, "eval_loss": 0.022878510877490044, "eval_reward": 0.3366666728258133, "eval_reward_std": 0.29963068544864657, "eval_rewards/MultiModalAccuracyORM/mean": 0.3366666728258133, "eval_rewards/MultiModalAccuracyORM/std": 0.29963068544864657, "eval_runtime": 620.6156, "eval_samples_per_second": 0.081, "eval_steps_per_second": 0.008, "step": 2250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.55, "completions/mean_length": 408.7458442687988, "completions/min_length": 214.2, "epoch": 0.9111111111111111, "grad_norm": 0.0911724129495613, "kl": 0.01767578125, "learning_rate": 2e-07, "loss": 0.05687015056610108, "memory(GiB)": 113.5, "reward": 0.3166666738688946, "reward_std": 0.32789033353328706, "rewards/MultiModalAccuracyORM/mean": 0.3166666738688946, "rewards/MultiModalAccuracyORM/std": 0.32789033353328706, "step": 2255, "train_speed(iter/s)": 0.031634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 646.3, "completions/mean_length": 310.35000762939455, "completions/min_length": 147.8, "epoch": 0.9131313131313131, "grad_norm": 2.3215328543725837, "kl": 0.02152099609375, "learning_rate": 2e-07, "loss": -0.02131924331188202, "memory(GiB)": 113.5, "reward": 0.3750000037252903, "reward_std": 0.2659719169139862, "rewards/MultiModalAccuracyORM/mean": 0.3750000037252903, "rewards/MultiModalAccuracyORM/std": 0.2659719169139862, "step": 2260, "train_speed(iter/s)": 0.03164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.3, "completions/mean_length": 371.5416793823242, "completions/min_length": 220.7, "epoch": 0.9151515151515152, "grad_norm": 2.126621344773754, "kl": 0.018896484375, "learning_rate": 2e-07, "loss": 0.024756547808647156, "memory(GiB)": 113.5, "reward": 0.2750000052154064, "reward_std": 0.2619264245033264, "rewards/MultiModalAccuracyORM/mean": 0.2750000052154064, "rewards/MultiModalAccuracyORM/std": 0.2619264245033264, "step": 2265, "train_speed(iter/s)": 0.031637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.4, "completions/mean_length": 415.3916778564453, "completions/min_length": 254.9, "epoch": 0.9171717171717172, "grad_norm": 2.9790243495572137, "kl": 0.0215087890625, "learning_rate": 2e-07, "loss": -0.012356171011924743, "memory(GiB)": 113.5, "reward": 0.1666666679084301, "reward_std": 0.27520077526569364, "rewards/MultiModalAccuracyORM/mean": 0.1666666679084301, "rewards/MultiModalAccuracyORM/std": 0.27520077526569364, "step": 2270, "train_speed(iter/s)": 0.031641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 771.9, "completions/mean_length": 425.84167633056643, "completions/min_length": 242.2, "epoch": 0.9191919191919192, "grad_norm": 1.0861715717638791, "kl": 0.0269012451171875, "learning_rate": 2e-07, "loss": 0.010645134747028351, "memory(GiB)": 113.5, "reward": 0.2583333425223827, "reward_std": 0.30260742604732516, "rewards/MultiModalAccuracyORM/mean": 0.2583333425223827, "rewards/MultiModalAccuracyORM/std": 0.30260742604732516, "step": 2275, "train_speed(iter/s)": 0.031636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.9, "completions/mean_length": 271.05000762939454, "completions/min_length": 149.0, "epoch": 0.9212121212121213, "grad_norm": 0.05924941442962051, "kl": 0.0225830078125, "learning_rate": 2e-07, "loss": 0.031935521960258485, "memory(GiB)": 113.5, "reward": 0.28333333805203437, "reward_std": 0.304396653175354, "rewards/MultiModalAccuracyORM/mean": 0.28333333805203437, "rewards/MultiModalAccuracyORM/std": 0.304396653175354, "step": 2280, "train_speed(iter/s)": 0.031652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.7, "completions/mean_length": 391.0416763305664, "completions/min_length": 203.9, "epoch": 0.9232323232323232, "grad_norm": 2.8587723324821566, "kl": 0.024896240234375, "learning_rate": 2e-07, "loss": 0.017455708980560303, "memory(GiB)": 113.5, "reward": 0.33333333805203436, "reward_std": 0.29177859127521516, "rewards/MultiModalAccuracyORM/mean": 0.33333333805203436, "rewards/MultiModalAccuracyORM/std": 0.29177859127521516, "step": 2285, "train_speed(iter/s)": 0.031656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.5, "completions/mean_length": 356.05834197998047, "completions/min_length": 179.4, "epoch": 0.9252525252525252, "grad_norm": 3.219718675307709, "kl": 0.0145965576171875, "learning_rate": 2e-07, "loss": -0.032944440841674805, "memory(GiB)": 113.5, "reward": 0.3000000067055225, "reward_std": 0.2840515673160553, "rewards/MultiModalAccuracyORM/mean": 0.3000000067055225, "rewards/MultiModalAccuracyORM/std": 0.2840515673160553, "step": 2290, "train_speed(iter/s)": 0.031663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.9, "completions/mean_length": 448.50000762939453, "completions/min_length": 247.4, "epoch": 0.9272727272727272, "grad_norm": 0.8757317301258869, "kl": 0.0155517578125, "learning_rate": 2e-07, "loss": -0.008566761016845703, "memory(GiB)": 113.5, "reward": 0.2833333432674408, "reward_std": 0.27596975266933443, "rewards/MultiModalAccuracyORM/mean": 0.2833333432674408, "rewards/MultiModalAccuracyORM/std": 0.27596975266933443, "step": 2295, "train_speed(iter/s)": 0.031671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.8, "completions/mean_length": 507.6833526611328, "completions/min_length": 320.1, "epoch": 0.9292929292929293, "grad_norm": 0.9362166291898165, "kl": 0.02156982421875, "learning_rate": 2e-07, "loss": 0.018462255597114563, "memory(GiB)": 113.5, "reward": 0.3083333373069763, "reward_std": 0.2464074045419693, "rewards/MultiModalAccuracyORM/mean": 0.3083333373069763, "rewards/MultiModalAccuracyORM/std": 0.2464074045419693, "step": 2300, "train_speed(iter/s)": 0.03166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.9, "completions/mean_length": 257.65000381469724, "completions/min_length": 128.1, "epoch": 0.9313131313131313, "grad_norm": 2.658818675138077, "kl": 0.029730224609375, "learning_rate": 2e-07, "loss": 0.023678554594516753, "memory(GiB)": 113.5, "reward": 0.30000000521540643, "reward_std": 0.15821026563644408, "rewards/MultiModalAccuracyORM/mean": 0.30000000521540643, "rewards/MultiModalAccuracyORM/std": 0.15821026563644408, "step": 2305, "train_speed(iter/s)": 0.031671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 800.1, "completions/mean_length": 461.5750137329102, "completions/min_length": 264.1, "epoch": 0.9333333333333333, "grad_norm": 1.525329758838897, "kl": 0.0238433837890625, "learning_rate": 2e-07, "loss": 0.016385090351104737, "memory(GiB)": 113.5, "reward": 0.1666666693985462, "reward_std": 0.3190022110939026, "rewards/MultiModalAccuracyORM/mean": 0.1666666693985462, "rewards/MultiModalAccuracyORM/std": 0.3190022110939026, "step": 2310, "train_speed(iter/s)": 0.031667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/mean_length": 360.6000061035156, "completions/min_length": 211.2, "epoch": 0.9353535353535354, "grad_norm": 2.555254446139955, "kl": 0.01925048828125, "learning_rate": 2e-07, "loss": -0.025304621458053587, "memory(GiB)": 113.5, "reward": 0.24166667088866234, "reward_std": 0.309637188911438, "rewards/MultiModalAccuracyORM/mean": 0.24166667088866234, "rewards/MultiModalAccuracyORM/std": 0.309637188911438, "step": 2315, "train_speed(iter/s)": 0.031675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/mean_length": 406.8916778564453, "completions/min_length": 230.6, "epoch": 0.9373737373737374, "grad_norm": 1.8305198392785023, "kl": 0.0230133056640625, "learning_rate": 2e-07, "loss": -0.014680406451225281, "memory(GiB)": 113.5, "reward": 0.3500000037252903, "reward_std": 0.3111986190080643, "rewards/MultiModalAccuracyORM/mean": 0.3500000037252903, "rewards/MultiModalAccuracyORM/std": 0.3111986190080643, "step": 2320, "train_speed(iter/s)": 0.031676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.8, "completions/mean_length": 282.15833892822263, "completions/min_length": 162.0, "epoch": 0.9393939393939394, "grad_norm": 2.852841229555507, "kl": 0.02640380859375, "learning_rate": 2e-07, "loss": 0.0326883852481842, "memory(GiB)": 113.5, "reward": 0.3083333432674408, "reward_std": 0.4167425513267517, "rewards/MultiModalAccuracyORM/mean": 0.3083333432674408, "rewards/MultiModalAccuracyORM/std": 0.4167425513267517, "step": 2325, "train_speed(iter/s)": 0.031687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 790.1, "completions/mean_length": 384.29168395996095, "completions/min_length": 177.1, "epoch": 0.9414141414141414, "grad_norm": 2.0178414254144723, "kl": 0.018560791015625, "learning_rate": 2e-07, "loss": 0.008831435441970825, "memory(GiB)": 113.5, "reward": 0.38333334028720856, "reward_std": 0.36893364489078523, "rewards/MultiModalAccuracyORM/mean": 0.38333334028720856, "rewards/MultiModalAccuracyORM/std": 0.36893364489078523, "step": 2330, "train_speed(iter/s)": 0.031685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 679.0, "completions/mean_length": 368.05834503173827, "completions/min_length": 206.5, "epoch": 0.9434343434343434, "grad_norm": 1.5228784773962754, "kl": 0.015521240234375, "learning_rate": 2e-07, "loss": -0.008360534906387329, "memory(GiB)": 113.5, "reward": 0.3083333373069763, "reward_std": 0.3352662086486816, "rewards/MultiModalAccuracyORM/mean": 0.3083333373069763, "rewards/MultiModalAccuracyORM/std": 0.3352662086486816, "step": 2335, "train_speed(iter/s)": 0.031686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 1030.2, "completions/mean_length": 481.2666900634766, "completions/min_length": 237.4, "epoch": 0.9454545454545454, "grad_norm": 1.418697346446445, "kl": 0.0329925537109375, "learning_rate": 2e-07, "loss": 0.0726934552192688, "memory(GiB)": 113.5, "reward": 0.2833333395421505, "reward_std": 0.3713845372200012, "rewards/MultiModalAccuracyORM/mean": 0.2833333395421505, "rewards/MultiModalAccuracyORM/std": 0.3713845372200012, "step": 2340, "train_speed(iter/s)": 0.031672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 668.9, "completions/mean_length": 321.70001220703125, "completions/min_length": 150.2, "epoch": 0.9474747474747475, "grad_norm": 2.0538342098414333, "kl": 0.03148193359375, "learning_rate": 2e-07, "loss": 0.035471782088279724, "memory(GiB)": 113.5, "reward": 0.29166667610406877, "reward_std": 0.1973894327878952, "rewards/MultiModalAccuracyORM/mean": 0.29166667610406877, "rewards/MultiModalAccuracyORM/std": 0.1973894327878952, "step": 2345, "train_speed(iter/s)": 0.031674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.4, "completions/mean_length": 358.76668243408204, "completions/min_length": 218.4, "epoch": 0.9494949494949495, "grad_norm": 2.6339218970926903, "kl": 0.0245635986328125, "learning_rate": 2e-07, "loss": 0.004336267709732056, "memory(GiB)": 113.5, "reward": 0.36666667833924294, "reward_std": 0.32297651171684266, "rewards/MultiModalAccuracyORM/mean": 0.36666667833924294, "rewards/MultiModalAccuracyORM/std": 0.32297651171684266, "step": 2350, "train_speed(iter/s)": 0.031691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.4, "completions/mean_length": 282.98334274291994, "completions/min_length": 158.3, "epoch": 0.9515151515151515, "grad_norm": 2.0291656458591145, "kl": 0.020587158203125, "learning_rate": 2e-07, "loss": -0.05831232666969299, "memory(GiB)": 113.5, "reward": 0.5000000081956386, "reward_std": 0.3330099433660507, "rewards/MultiModalAccuracyORM/mean": 0.5000000081956386, "rewards/MultiModalAccuracyORM/std": 0.3330099433660507, "step": 2355, "train_speed(iter/s)": 0.031702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.3, "completions/mean_length": 282.77500610351564, "completions/min_length": 139.0, "epoch": 0.9535353535353536, "grad_norm": 0.11573786538748869, "kl": 0.0304229736328125, "learning_rate": 2e-07, "loss": 0.03489102721214295, "memory(GiB)": 113.5, "reward": 0.24166666939854622, "reward_std": 0.2355453997850418, "rewards/MultiModalAccuracyORM/mean": 0.24166666939854622, "rewards/MultiModalAccuracyORM/std": 0.2355453997850418, "step": 2360, "train_speed(iter/s)": 0.031718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.4, "completions/mean_length": 412.06668243408205, "completions/min_length": 245.3, "epoch": 0.9555555555555556, "grad_norm": 2.17622948866824, "kl": 0.019061279296875, "learning_rate": 2e-07, "loss": 0.005562397837638855, "memory(GiB)": 113.5, "reward": 0.4250000111758709, "reward_std": 0.45383972525596616, "rewards/MultiModalAccuracyORM/mean": 0.4250000111758709, "rewards/MultiModalAccuracyORM/std": 0.45383972525596616, "step": 2365, "train_speed(iter/s)": 0.031722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.1, "completions/mean_length": 371.3166793823242, "completions/min_length": 208.3, "epoch": 0.9575757575757575, "grad_norm": 2.3917395292059282, "kl": 0.031591796875, "learning_rate": 2e-07, "loss": 0.00018071085214614867, "memory(GiB)": 113.5, "reward": 0.291666679084301, "reward_std": 0.26498726308345794, "rewards/MultiModalAccuracyORM/mean": 0.291666679084301, "rewards/MultiModalAccuracyORM/std": 0.26498726308345794, "step": 2370, "train_speed(iter/s)": 0.031735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.6, "completions/mean_length": 301.4333435058594, "completions/min_length": 178.5, "epoch": 0.9595959595959596, "grad_norm": 3.5970167327213822, "kl": 0.02054443359375, "learning_rate": 2e-07, "loss": 0.01565767079591751, "memory(GiB)": 113.5, "reward": 0.44166667833924295, "reward_std": 0.26897316575050356, "rewards/MultiModalAccuracyORM/mean": 0.44166667833924295, "rewards/MultiModalAccuracyORM/std": 0.26897316575050356, "step": 2375, "train_speed(iter/s)": 0.031751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.8, "completions/mean_length": 327.98334350585935, "completions/min_length": 184.6, "epoch": 0.9616161616161616, "grad_norm": 2.197976826013823, "kl": 0.021331787109375, "learning_rate": 2e-07, "loss": 0.005569913983345031, "memory(GiB)": 113.5, "reward": 0.43333334028720855, "reward_std": 0.3840597689151764, "rewards/MultiModalAccuracyORM/mean": 0.43333334028720855, "rewards/MultiModalAccuracyORM/std": 0.3840597689151764, "step": 2380, "train_speed(iter/s)": 0.031761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 708.0, "completions/mean_length": 293.06667556762693, "completions/min_length": 155.5, "epoch": 0.9636363636363636, "grad_norm": 2.126614857423257, "kl": 0.0335205078125, "learning_rate": 2e-07, "loss": 0.0018027305603027343, "memory(GiB)": 113.5, "reward": 0.416666679084301, "reward_std": 0.3855114609003067, "rewards/MultiModalAccuracyORM/mean": 0.416666679084301, "rewards/MultiModalAccuracyORM/std": 0.3855114609003067, "step": 2385, "train_speed(iter/s)": 0.031758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.6, "completions/mean_length": 374.27501220703124, "completions/min_length": 226.1, "epoch": 0.9656565656565657, "grad_norm": 2.285791825740683, "kl": 0.03223876953125, "learning_rate": 2e-07, "loss": -0.007699564099311829, "memory(GiB)": 113.5, "reward": 0.14166667088866233, "reward_std": 0.3000969380140305, "rewards/MultiModalAccuracyORM/mean": 0.14166667088866233, "rewards/MultiModalAccuracyORM/std": 0.3000969380140305, "step": 2390, "train_speed(iter/s)": 0.031759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 601.7, "completions/mean_length": 297.45834197998045, "completions/min_length": 152.0, "epoch": 0.9676767676767677, "grad_norm": 2.926559087104104, "kl": 0.0413818359375, "learning_rate": 2e-07, "loss": 0.04997736811637878, "memory(GiB)": 113.5, "reward": 0.31666667610406873, "reward_std": 0.3687034219503403, "rewards/MultiModalAccuracyORM/mean": 0.31666667610406873, "rewards/MultiModalAccuracyORM/std": 0.3687034219503403, "step": 2395, "train_speed(iter/s)": 0.031761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.1, "completions/mean_length": 422.52500915527344, "completions/min_length": 260.5, "epoch": 0.9696969696969697, "grad_norm": 1.0391142999786047, "kl": 0.02857666015625, "learning_rate": 2e-07, "loss": -0.008375594019889831, "memory(GiB)": 113.5, "reward": 0.45000000670552254, "reward_std": 0.34407602846622465, "rewards/MultiModalAccuracyORM/mean": 0.45000000670552254, "rewards/MultiModalAccuracyORM/std": 0.34407602846622465, "step": 2400, "train_speed(iter/s)": 0.031765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 726.7, "completions/mean_length": 369.96667938232423, "completions/min_length": 166.7, "epoch": 0.9717171717171718, "grad_norm": 1.9044448293066447, "kl": 0.034771728515625, "learning_rate": 2e-07, "loss": 0.041448038816452024, "memory(GiB)": 113.5, "reward": 0.3666666761040688, "reward_std": 0.3330695390701294, "rewards/MultiModalAccuracyORM/mean": 0.3666666761040688, "rewards/MultiModalAccuracyORM/std": 0.3330695390701294, "step": 2405, "train_speed(iter/s)": 0.031765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 860.8, "completions/mean_length": 442.18335113525393, "completions/min_length": 199.9, "epoch": 0.9737373737373738, "grad_norm": 1.1043100849775993, "kl": 0.0294525146484375, "learning_rate": 2e-07, "loss": 0.011988846212625503, "memory(GiB)": 113.5, "reward": 0.3166666731238365, "reward_std": 0.383000972867012, "rewards/MultiModalAccuracyORM/mean": 0.3166666731238365, "rewards/MultiModalAccuracyORM/std": 0.383000972867012, "step": 2410, "train_speed(iter/s)": 0.031763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03333333333333333, "completions/max_length": 964.9, "completions/mean_length": 423.55834503173827, "completions/min_length": 182.4, "epoch": 0.9757575757575757, "grad_norm": 3.0380086133675968, "kl": 0.037750244140625, "learning_rate": 2e-07, "loss": 0.02129605710506439, "memory(GiB)": 113.5, "reward": 0.5166666716337204, "reward_std": 0.2104335606098175, "rewards/MultiModalAccuracyORM/mean": 0.5166666716337204, "rewards/MultiModalAccuracyORM/std": 0.2104335606098175, "step": 2415, "train_speed(iter/s)": 0.031748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/mean_length": 316.23334274291994, "completions/min_length": 185.4, "epoch": 0.9777777777777777, "grad_norm": 3.286741279330765, "kl": 0.03631591796875, "learning_rate": 2e-07, "loss": 0.01842118501663208, "memory(GiB)": 113.5, "reward": 0.4916666768491268, "reward_std": 0.3266936391592026, "rewards/MultiModalAccuracyORM/mean": 0.4916666768491268, "rewards/MultiModalAccuracyORM/std": 0.3266936391592026, "step": 2420, "train_speed(iter/s)": 0.031765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.4, "completions/mean_length": 382.6333435058594, "completions/min_length": 208.4, "epoch": 0.9797979797979798, "grad_norm": 3.1686862418479125, "kl": 0.05205078125, "learning_rate": 2e-07, "loss": 0.011619596928358077, "memory(GiB)": 113.5, "reward": 0.3166666731238365, "reward_std": 0.32526837289333344, "rewards/MultiModalAccuracyORM/mean": 0.3166666731238365, "rewards/MultiModalAccuracyORM/std": 0.32526837289333344, "step": 2425, "train_speed(iter/s)": 0.031766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.9, "completions/mean_length": 343.9250144958496, "completions/min_length": 189.7, "epoch": 0.9818181818181818, "grad_norm": 1.5585214145494302, "kl": 0.034375, "learning_rate": 2e-07, "loss": 0.0014587238430976868, "memory(GiB)": 113.5, "reward": 0.17500000596046447, "reward_std": 0.3244759202003479, "rewards/MultiModalAccuracyORM/mean": 0.17500000596046447, "rewards/MultiModalAccuracyORM/std": 0.3244759202003479, "step": 2430, "train_speed(iter/s)": 0.031779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.2, "completions/mean_length": 378.7916778564453, "completions/min_length": 209.2, "epoch": 0.9838383838383838, "grad_norm": 2.73232643290958, "kl": 0.04532470703125, "learning_rate": 2e-07, "loss": 0.062485653162002566, "memory(GiB)": 113.5, "reward": 0.4666666828095913, "reward_std": 0.4470617562532425, "rewards/MultiModalAccuracyORM/mean": 0.4666666828095913, "rewards/MultiModalAccuracyORM/std": 0.4470617562532425, "step": 2435, "train_speed(iter/s)": 0.031786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 738.5, "completions/mean_length": 341.6166717529297, "completions/min_length": 163.9, "epoch": 0.9858585858585859, "grad_norm": 1.3853546154126857, "kl": 0.0336669921875, "learning_rate": 2e-07, "loss": -0.028276541829109193, "memory(GiB)": 113.5, "reward": 0.14166667088866233, "reward_std": 0.3000969380140305, "rewards/MultiModalAccuracyORM/mean": 0.14166667088866233, "rewards/MultiModalAccuracyORM/std": 0.3000969380140305, "step": 2440, "train_speed(iter/s)": 0.03179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03333333333333333, "completions/max_length": 914.9, "completions/mean_length": 449.3500152587891, "completions/min_length": 241.3, "epoch": 0.9878787878787879, "grad_norm": 2.2456582209413893, "kl": 0.03409423828125, "learning_rate": 2e-07, "loss": -0.0006516605615615844, "memory(GiB)": 113.5, "reward": 0.2083333358168602, "reward_std": 0.32050161957740786, "rewards/MultiModalAccuracyORM/mean": 0.2083333358168602, "rewards/MultiModalAccuracyORM/std": 0.32050161957740786, "step": 2445, "train_speed(iter/s)": 0.031782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.008333333333333333, "completions/max_length": 626.9, "completions/mean_length": 322.7333419799805, "completions/min_length": 203.0, "epoch": 0.98989898989899, "grad_norm": 2.6524648003013103, "kl": 0.03165283203125, "learning_rate": 2e-07, "loss": -0.008733100444078445, "memory(GiB)": 113.5, "reward": 0.6416666708886624, "reward_std": 0.15824586153030396, "rewards/MultiModalAccuracyORM/mean": 0.6416666708886624, "rewards/MultiModalAccuracyORM/std": 0.15824586153030396, "step": 2450, "train_speed(iter/s)": 0.031782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 881.4, "completions/mean_length": 330.15001068115237, "completions/min_length": 158.1, "epoch": 0.9919191919191919, "grad_norm": 1.7910216600269697, "kl": 0.05638427734375, "learning_rate": 2e-07, "loss": -0.0065705299377441405, "memory(GiB)": 113.5, "reward": 0.2333333395421505, "reward_std": 0.2815410792827606, "rewards/MultiModalAccuracyORM/mean": 0.2333333395421505, "rewards/MultiModalAccuracyORM/std": 0.2815410792827606, "step": 2455, "train_speed(iter/s)": 0.031777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.016666666666666666, "completions/max_length": 851.2, "completions/mean_length": 378.46667633056643, "completions/min_length": 185.4, "epoch": 0.9939393939393939, "grad_norm": 2.5045509391063856, "kl": 0.04505615234375, "learning_rate": 2e-07, "loss": -0.008945465087890625, "memory(GiB)": 113.5, "reward": 0.40833334252238274, "reward_std": 0.3794672876596451, "rewards/MultiModalAccuracyORM/mean": 0.40833334252238274, "rewards/MultiModalAccuracyORM/std": 0.3794672876596451, "step": 2460, "train_speed(iter/s)": 0.031779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03333333333333333, "completions/max_length": 879.8, "completions/mean_length": 371.80834045410154, "completions/min_length": 158.4, "epoch": 0.9959595959595959, "grad_norm": 3.572517250532036, "kl": 0.051068115234375, "learning_rate": 2e-07, "loss": -0.013737475872039795, "memory(GiB)": 113.5, "reward": 0.31666667088866235, "reward_std": 0.29408499896526336, "rewards/MultiModalAccuracyORM/mean": 0.31666667088866235, "rewards/MultiModalAccuracyORM/std": 0.29408499896526336, "step": 2465, "train_speed(iter/s)": 0.031766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.03333333333333333, "completions/max_length": 1149.5, "completions/mean_length": 445.8500122070312, "completions/min_length": 184.4, "epoch": 0.997979797979798, "grad_norm": 2.2904897334575254, "kl": 0.04656982421875, "learning_rate": 2e-07, "loss": -0.032226094603538515, "memory(GiB)": 113.5, "reward": 0.3416666775941849, "reward_std": 0.4094175934791565, "rewards/MultiModalAccuracyORM/mean": 0.3416666775941849, "rewards/MultiModalAccuracyORM/std": 0.4094175934791565, "step": 2470, "train_speed(iter/s)": 0.031754 }, { "epoch": 1.0, "grad_norm": 1.4731764005283963, "learning_rate": 2e-07, "loss": 0.061235594749450686, "memory(GiB)": 113.5, "step": 2475, "train_speed(iter/s)": 0.031746 }, { "epoch": 1.0, "eval_clip_ratio": 0.0, "eval_completions/clipped_ratio": 0.018333333333333333, "eval_completions/max_length": 787.14, "eval_completions/mean_length": 378.51834548950194, "eval_completions/min_length": 186.72, "eval_kl": 0.040185546875, "eval_loss": 0.029814261943101883, "eval_reward": 0.3483333396911621, "eval_reward_std": 0.3004326641559601, "eval_rewards/MultiModalAccuracyORM/mean": 0.3483333396911621, "eval_rewards/MultiModalAccuracyORM/std": 0.3004326641559601, "eval_runtime": 729.694, "eval_samples_per_second": 0.069, "eval_steps_per_second": 0.007, "step": 2475 } ], "logging_steps": 5, "max_steps": 2475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }