{ "best_metric": 0.4650000059604645, "best_model_checkpoint": "/mnt/data/user/zhao_jun/tangjixin/output/model/llava_ov-grpo_new_v20_5k/v8-20250330-101445/checkpoint-2475", "epoch": 1.0, "eval_steps": 250, "global_step": 2475, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 12.833333730697632, "epoch": 0.00040404040404040404, "grad_norm": 4.95974063873291, "kl": 0.0007257461547851562, "learning_rate": 1.6129032258064515e-09, "loss": 0.1313462257385254, "memory(GiB)": 103.91, "response_clip_ratio": 0.0, "reward": 0.125, "reward_std": 0.22613351047039032, "rewards/MultiModalAccuracyORM": 0.125, "step": 1, "train_speed(iter/s)": 0.011139 }, { "clip_ratio": 0.0, "completion_length": 45.927083522081375, "epoch": 0.00202020202020202, "grad_norm": 0.026089413091540337, "kl": 0.00024419277906417847, "learning_rate": 8.064516129032257e-09, "loss": -0.0017255048733204603, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.0416666679084301, "reward_std": 0.09731236100196838, "rewards/MultiModalAccuracyORM": 0.0416666679084301, "step": 5, "train_speed(iter/s)": 0.028079 }, { "clip_ratio": 0.0, "completion_length": 33.066667795181274, "epoch": 0.00404040404040404, "grad_norm": 4.474486827850342, "kl": 4.897117614746094e-05, "learning_rate": 1.6129032258064514e-08, "loss": 0.005788012593984604, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1250000014901161, "reward_std": 0.2712650209665298, "rewards/MultiModalAccuracyORM": 0.1250000014901161, "step": 10, "train_speed(iter/s)": 0.034795 }, { "clip_ratio": 0.0, "completion_length": 39.85000114440918, "epoch": 0.006060606060606061, "grad_norm": 2.904900074005127, "kl": 0.00015695095062255858, "learning_rate": 2.4193548387096773e-08, "loss": 0.036757296323776244, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2416666716337204, "reward_std": 0.29389037787914274, "rewards/MultiModalAccuracyORM": 0.2416666716337204, "step": 15, "train_speed(iter/s)": 0.0376 }, { "clip_ratio": 0.0, "completion_length": 69.10000429153442, "epoch": 0.00808080808080808, "grad_norm": 1.9090512990951538, "kl": 0.00022979974746704102, "learning_rate": 3.225806451612903e-08, "loss": 0.00942036360502243, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.16666667088866233, "reward_std": 0.31846399009227755, "rewards/MultiModalAccuracyORM": 0.16666667088866233, "step": 20, "train_speed(iter/s)": 0.03857 }, { "clip_ratio": 0.0, "completion_length": 34.64166686534882, "epoch": 0.010101010101010102, "grad_norm": 23.398836135864258, "kl": 0.00027928352355957033, "learning_rate": 4.032258064516129e-08, "loss": -0.005109664052724838, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333410322666, "reward_std": 0.31046820282936094, "rewards/MultiModalAccuracyORM": 0.2083333410322666, "step": 25, "train_speed(iter/s)": 0.039527 }, { "clip_ratio": 0.0, "completion_length": 27.183334088325502, "epoch": 0.012121212121212121, "grad_norm": 0.027309712022542953, "kl": 0.0002372264862060547, "learning_rate": 4.8387096774193546e-08, "loss": -0.016541659832000732, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.05833333432674408, "reward_std": 0.14188667237758637, "rewards/MultiModalAccuracyORM": 0.05833333432674408, "step": 30, "train_speed(iter/s)": 0.040173 }, { "clip_ratio": 0.0, "completion_length": 48.750002241134645, "epoch": 0.014141414141414142, "grad_norm": 2.6486644744873047, "kl": 0.00022208690643310547, "learning_rate": 5.645161290322581e-08, "loss": 0.03488517701625824, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1416666716337204, "reward_std": 0.19962169826030732, "rewards/MultiModalAccuracyORM": 0.1416666716337204, "step": 35, "train_speed(iter/s)": 0.040888 }, { "clip_ratio": 0.0, "completion_length": 7.7666668176651, "epoch": 0.01616161616161616, "grad_norm": 13.41940689086914, "kl": 0.00021257400512695313, "learning_rate": 6.451612903225806e-08, "loss": -0.0012449542991816998, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.2652174890041351, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 40, "train_speed(iter/s)": 0.041651 }, { "clip_ratio": 0.0, "completion_length": 65.25000057220458, "epoch": 0.01818181818181818, "grad_norm": 11.40164852142334, "kl": 5.4210424423217773e-05, "learning_rate": 7.258064516129032e-08, "loss": 0.03769001364707947, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667386889457, "reward_std": 0.325963220000267, "rewards/MultiModalAccuracyORM": 0.21666667386889457, "step": 45, "train_speed(iter/s)": 0.041539 }, { "clip_ratio": 0.0, "completion_length": 32.84166791439056, "epoch": 0.020202020202020204, "grad_norm": 0.03606203943490982, "kl": 0.00031108856201171874, "learning_rate": 8.064516129032257e-08, "loss": 1.2442469596862793e-05, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.07500000074505805, "reward_std": 0.15824586153030396, "rewards/MultiModalAccuracyORM": 0.07500000074505805, "step": 50, "train_speed(iter/s)": 0.041821 }, { "clip_ratio": 0.0, "completion_length": 20.025000762939452, "epoch": 0.022222222222222223, "grad_norm": 3.2404561042785645, "kl": 0.0004961967468261718, "learning_rate": 8.870967741935484e-08, "loss": 0.016841122508049013, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166666865348815, "reward_std": 0.3241831511259079, "rewards/MultiModalAccuracyORM": 0.24166666865348815, "step": 55, "train_speed(iter/s)": 0.042244 }, { "clip_ratio": 0.0, "completion_length": 7.516666769981384, "epoch": 0.024242424242424242, "grad_norm": 3.8046255111694336, "kl": 6.520748138427735e-06, "learning_rate": 9.677419354838709e-08, "loss": -0.001297527551651001, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.36666668206453323, "reward_std": 0.330777695775032, "rewards/MultiModalAccuracyORM": 0.36666668206453323, "step": 60, "train_speed(iter/s)": 0.042408 }, { "clip_ratio": 0.0, "completion_length": 9.5333336353302, "epoch": 0.026262626262626262, "grad_norm": 0.015074208378791809, "kl": 0.00015583038330078126, "learning_rate": 1.0483870967741934e-07, "loss": -0.018772208690643312, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.3019101768732071, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 65, "train_speed(iter/s)": 0.04265 }, { "clip_ratio": 0.0, "completion_length": 14.125000405311585, "epoch": 0.028282828282828285, "grad_norm": 1.4802911281585693, "kl": 0.0001938343048095703, "learning_rate": 1.1290322580645162e-07, "loss": 0.04349477887153626, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.26123160123825073, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 70, "train_speed(iter/s)": 0.042774 }, { "clip_ratio": 0.0, "completion_length": 9.00833351612091, "epoch": 0.030303030303030304, "grad_norm": 17.15009880065918, "kl": 0.0005457401275634766, "learning_rate": 1.2096774193548387e-07, "loss": -0.03085809648036957, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000000298023224, "reward_std": 0.2855865776538849, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 75, "train_speed(iter/s)": 0.043032 }, { "clip_ratio": 0.0, "completion_length": 30.941667556762695, "epoch": 0.03232323232323232, "grad_norm": 0.15290312469005585, "kl": 0.0005632162094116211, "learning_rate": 1.2903225806451611e-07, "loss": -0.019948795437812805, "memory(GiB)": 104.49, "response_clip_ratio": 0.00833333358168602, "reward": 0.15000000447034836, "reward_std": 0.2066778928041458, "rewards/MultiModalAccuracyORM": 0.15000000447034836, "step": 80, "train_speed(iter/s)": 0.042552 }, { "clip_ratio": 0.0, "completion_length": 13.350000309944154, "epoch": 0.03434343434343434, "grad_norm": 10.242753028869629, "kl": 0.0002181917428970337, "learning_rate": 1.3709677419354838e-07, "loss": -0.0021827301010489465, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.36318225264549253, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 85, "train_speed(iter/s)": 0.042776 }, { "clip_ratio": 0.0, "completion_length": 31.70833353996277, "epoch": 0.03636363636363636, "grad_norm": 18.3216552734375, "kl": 0.00013442039489746093, "learning_rate": 1.4516129032258064e-07, "loss": -0.014865413308143616, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000521540643, "reward_std": 0.19786564111709595, "rewards/MultiModalAccuracyORM": 0.17500000521540643, "step": 90, "train_speed(iter/s)": 0.042668 }, { "clip_ratio": 0.0, "completion_length": 12.166666793823243, "epoch": 0.03838383838383838, "grad_norm": 2.986149311065674, "kl": 0.00017652511596679687, "learning_rate": 1.5322580645161288e-07, "loss": -0.004295501857995987, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.0833333358168602, "reward_std": 0.18482151627540588, "rewards/MultiModalAccuracyORM": 0.0833333358168602, "step": 95, "train_speed(iter/s)": 0.042663 }, { "clip_ratio": 0.0, "completion_length": 37.32500224113464, "epoch": 0.04040404040404041, "grad_norm": 9.087557792663574, "kl": 0.00025534629821777344, "learning_rate": 1.6129032258064515e-07, "loss": -0.042690178751945494, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.3192540168762207, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 100, "train_speed(iter/s)": 0.042723 }, { "clip_ratio": 0.0, "completion_length": 42.64166672229767, "epoch": 0.04242424242424243, "grad_norm": 1.299012303352356, "kl": 0.000713956356048584, "learning_rate": 1.6935483870967741e-07, "loss": -0.01074601411819458, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.2782616138458252, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 105, "train_speed(iter/s)": 0.042694 }, { "clip_ratio": 0.0, "completion_length": 25.308334159851075, "epoch": 0.044444444444444446, "grad_norm": 20.200790405273438, "kl": -2.079010009765625e-05, "learning_rate": 1.7741935483870968e-07, "loss": -0.0049890361726284025, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.34557787179946897, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 110, "train_speed(iter/s)": 0.042795 }, { "clip_ratio": 0.0, "completion_length": 17.325000619888307, "epoch": 0.046464646464646465, "grad_norm": 2.473445177078247, "kl": 0.0003504753112792969, "learning_rate": 1.8548387096774192e-07, "loss": 0.009455542266368865, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15833333805203437, "reward_std": 0.2629852324724197, "rewards/MultiModalAccuracyORM": 0.15833333805203437, "step": 115, "train_speed(iter/s)": 0.042806 }, { "clip_ratio": 0.0, "completion_length": 17.07500042915344, "epoch": 0.048484848484848485, "grad_norm": 18.782503128051758, "kl": 0.00040736198425292967, "learning_rate": 1.9354838709677418e-07, "loss": -0.00938464030623436, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10000000149011612, "reward_std": 0.17861495018005372, "rewards/MultiModalAccuracyORM": 0.10000000149011612, "step": 120, "train_speed(iter/s)": 0.042915 }, { "clip_ratio": 0.0, "completion_length": 40.666668796539305, "epoch": 0.050505050505050504, "grad_norm": 10.809483528137207, "kl": 0.00013909339904785156, "learning_rate": 2e-07, "loss": 0.015682700276374816, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333417773247, "reward_std": 0.2325587123632431, "rewards/MultiModalAccuracyORM": 0.3083333417773247, "step": 125, "train_speed(iter/s)": 0.042982 }, { "clip_ratio": 0.0, "completion_length": 21.79166784286499, "epoch": 0.052525252525252523, "grad_norm": 0.059968430548906326, "kl": 0.0003565549850463867, "learning_rate": 2e-07, "loss": -0.012978824973106384, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.2775311887264252, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 130, "train_speed(iter/s)": 0.043138 }, { "clip_ratio": 0.0, "completion_length": 9.358333635330201, "epoch": 0.05454545454545454, "grad_norm": 16.368749618530273, "kl": 0.0005423665046691894, "learning_rate": 2e-07, "loss": -0.018562111258506774, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333432674408, "reward_std": 0.3227223068475723, "rewards/MultiModalAccuracyORM": 0.2083333432674408, "step": 135, "train_speed(iter/s)": 0.043281 }, { "clip_ratio": 0.0, "completion_length": 46.30833601951599, "epoch": 0.05656565656565657, "grad_norm": 8.052789688110352, "kl": 0.0008988380432128906, "learning_rate": 2e-07, "loss": 0.05945103764533997, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333879709243, "reward_std": 0.32900004684925077, "rewards/MultiModalAccuracyORM": 0.20833333879709243, "step": 140, "train_speed(iter/s)": 0.043225 }, { "clip_ratio": 0.0, "completion_length": 4.983333492279053, "epoch": 0.05858585858585859, "grad_norm": 5.5169525146484375, "kl": 0.0008536338806152344, "learning_rate": 2e-07, "loss": -0.03663218915462494, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2000000074505806, "reward_std": 0.29079394936561587, "rewards/MultiModalAccuracyORM": 0.2000000074505806, "step": 145, "train_speed(iter/s)": 0.043361 }, { "clip_ratio": 0.0, "completion_length": 7.125000166893005, "epoch": 0.06060606060606061, "grad_norm": 0.07958526909351349, "kl": 0.001511383056640625, "learning_rate": 2e-07, "loss": 0.05411055088043213, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2333333373069763, "reward_std": 0.27122943103313446, "rewards/MultiModalAccuracyORM": 0.2333333373069763, "step": 150, "train_speed(iter/s)": 0.043443 }, { "clip_ratio": 0.0, "completion_length": 7.666666889190674, "epoch": 0.06262626262626263, "grad_norm": 0.0961478129029274, "kl": 0.0021147727966308594, "learning_rate": 2e-07, "loss": 0.0017779668793082236, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000000447034835, "reward_std": 0.22052658796310426, "rewards/MultiModalAccuracyORM": 0.20000000447034835, "step": 155, "train_speed(iter/s)": 0.043434 }, { "clip_ratio": 0.0, "completion_length": 56.125001430511475, "epoch": 0.06464646464646465, "grad_norm": 3.5018489360809326, "kl": 0.0011393070220947266, "learning_rate": 2e-07, "loss": 0.003215038776397705, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1833333395421505, "reward_std": 0.2687189429998398, "rewards/MultiModalAccuracyORM": 0.1833333395421505, "step": 160, "train_speed(iter/s)": 0.043475 }, { "clip_ratio": 0.0, "completion_length": 33.183334255218504, "epoch": 0.06666666666666667, "grad_norm": 1.7839807271957397, "kl": 0.001880502700805664, "learning_rate": 2e-07, "loss": 0.037510618567466736, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1250000037252903, "reward_std": 0.2629256367683411, "rewards/MultiModalAccuracyORM": 0.1250000037252903, "step": 165, "train_speed(iter/s)": 0.04338 }, { "clip_ratio": 0.0, "completion_length": 12.583333587646484, "epoch": 0.06868686868686869, "grad_norm": 2.9806480407714844, "kl": 0.001198887825012207, "learning_rate": 2e-07, "loss": 0.007929786294698715, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.19166667461395265, "reward_std": 0.21750431060791015, "rewards/MultiModalAccuracyORM": 0.19166667461395265, "step": 170, "train_speed(iter/s)": 0.043348 }, { "clip_ratio": 0.0, "completion_length": 10.308333587646484, "epoch": 0.0707070707070707, "grad_norm": 0.006374528165906668, "kl": 0.008016198873519897, "learning_rate": 2e-07, "loss": 0.0161195233464241, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.19166667014360428, "reward_std": 0.2822715133428574, "rewards/MultiModalAccuracyORM": 0.19166667014360428, "step": 175, "train_speed(iter/s)": 0.043522 }, { "clip_ratio": 0.0, "completion_length": 17.283334064483643, "epoch": 0.07272727272727272, "grad_norm": 13.373006820678711, "kl": 0.005344104766845703, "learning_rate": 2e-07, "loss": 0.005642924830317498, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1250000037252903, "reward_std": 0.2629256367683411, "rewards/MultiModalAccuracyORM": 0.1250000037252903, "step": 180, "train_speed(iter/s)": 0.043562 }, { "clip_ratio": 0.0, "completion_length": 7.858333396911621, "epoch": 0.07474747474747474, "grad_norm": 20.940757751464844, "kl": 0.004119682312011719, "learning_rate": 2e-07, "loss": -0.014204351603984833, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667312383653, "reward_std": 0.24560283720493317, "rewards/MultiModalAccuracyORM": 0.21666667312383653, "step": 185, "train_speed(iter/s)": 0.043604 }, { "clip_ratio": 0.0, "completion_length": 10.808333730697631, "epoch": 0.07676767676767676, "grad_norm": 1.9175783395767212, "kl": 0.0015784263610839843, "learning_rate": 2e-07, "loss": 0.036653178930282596, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.27774982452392577, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 190, "train_speed(iter/s)": 0.043708 }, { "clip_ratio": 0.0, "completion_length": 8.058333468437194, "epoch": 0.07878787878787878, "grad_norm": 20.731929779052734, "kl": 0.002748870849609375, "learning_rate": 2e-07, "loss": -0.007462918758392334, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667461395264, "reward_std": 0.26047474443912505, "rewards/MultiModalAccuracyORM": 0.24166667461395264, "step": 195, "train_speed(iter/s)": 0.043797 }, { "clip_ratio": 0.0, "completion_length": 39.766668224334715, "epoch": 0.08080808080808081, "grad_norm": 32.81786346435547, "kl": 0.012819027900695801, "learning_rate": 2e-07, "loss": -0.012741921842098236, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1916666731238365, "reward_std": 0.3634008765220642, "rewards/MultiModalAccuracyORM": 0.1916666731238365, "step": 200, "train_speed(iter/s)": 0.043791 }, { "clip_ratio": 0.0, "completion_length": 7.191666889190674, "epoch": 0.08282828282828283, "grad_norm": 10.631654739379883, "kl": 0.007097434997558594, "learning_rate": 2e-07, "loss": -0.059709519147872925, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000223517418, "reward_std": 0.26302082240581515, "rewards/MultiModalAccuracyORM": 0.15000000223517418, "step": 205, "train_speed(iter/s)": 0.043862 }, { "clip_ratio": 0.0, "completion_length": 10.100000143051147, "epoch": 0.08484848484848485, "grad_norm": 15.135857582092285, "kl": 0.016997623443603515, "learning_rate": 2e-07, "loss": 0.036284705996513365, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10833333730697632, "reward_std": 0.24481281042098998, "rewards/MultiModalAccuracyORM": 0.10833333730697632, "step": 210, "train_speed(iter/s)": 0.043845 }, { "clip_ratio": 0.0, "completion_length": 10.075000190734864, "epoch": 0.08686868686868687, "grad_norm": 15.046256065368652, "kl": 0.013745307922363281, "learning_rate": 2e-07, "loss": -0.01842743158340454, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3001325339078903, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 215, "train_speed(iter/s)": 0.043911 }, { "clip_ratio": 0.0, "completion_length": 31.94166750907898, "epoch": 0.08888888888888889, "grad_norm": 14.397719383239746, "kl": 0.01525421142578125, "learning_rate": 2e-07, "loss": -0.016506943106651305, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000894069672, "reward_std": 0.24662604331970214, "rewards/MultiModalAccuracyORM": 0.22500000894069672, "step": 220, "train_speed(iter/s)": 0.043992 }, { "clip_ratio": 0.0, "completion_length": 5.44166669845581, "epoch": 0.09090909090909091, "grad_norm": 12.164202690124512, "kl": 0.025649261474609376, "learning_rate": 2e-07, "loss": 0.017044636607170104, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000074505806, "reward_std": 0.28160068988800047, "rewards/MultiModalAccuracyORM": 0.3000000074505806, "step": 225, "train_speed(iter/s)": 0.044113 }, { "clip_ratio": 0.0, "completion_length": 39.383334040641785, "epoch": 0.09292929292929293, "grad_norm": 21.127038955688477, "kl": 0.024017763137817384, "learning_rate": 2e-07, "loss": 0.02930714190006256, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1666666693985462, "reward_std": 0.26196202635765076, "rewards/MultiModalAccuracyORM": 0.1666666693985462, "step": 230, "train_speed(iter/s)": 0.044142 }, { "clip_ratio": 0.0, "completion_length": 14.891666889190674, "epoch": 0.09494949494949495, "grad_norm": 6.2940568923950195, "kl": 0.027823114395141603, "learning_rate": 2e-07, "loss": -0.009951599687337876, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000521540643, "reward_std": 0.3036638140678406, "rewards/MultiModalAccuracyORM": 0.17500000521540643, "step": 235, "train_speed(iter/s)": 0.044213 }, { "clip_ratio": 0.0, "completion_length": 6.750000047683716, "epoch": 0.09696969696969697, "grad_norm": 3.980544090270996, "kl": 0.018259000778198243, "learning_rate": 2e-07, "loss": -0.020673815906047822, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.16666667014360428, "reward_std": 0.21823472976684571, "rewards/MultiModalAccuracyORM": 0.16666667014360428, "step": 240, "train_speed(iter/s)": 0.044233 }, { "clip_ratio": 0.0, "completion_length": 10.80000023841858, "epoch": 0.09898989898989899, "grad_norm": 1.3881502151489258, "kl": 0.000605630874633789, "learning_rate": 2e-07, "loss": -0.01487920731306076, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333334177732467, "reward_std": 0.30661733746528624, "rewards/MultiModalAccuracyORM": 0.28333334177732467, "step": 245, "train_speed(iter/s)": 0.044235 }, { "epoch": 0.10101010101010101, "grad_norm": 11.512455940246582, "learning_rate": 2e-07, "loss": 0.033054867386817934, "memory(GiB)": 104.49, "step": 250, "train_speed(iter/s)": 0.044081 }, { "epoch": 0.10101010101010101, "eval_clip_ratio": 0.0, "eval_completion_length": 24.26333417892456, "eval_kl": 0.022986836433410644, "eval_loss": 0.027694934979081154, "eval_response_clip_ratio": 0.0, "eval_reward": 0.2150000040233135, "eval_reward_std": 0.2852368396520615, "eval_rewards/MultiModalAccuracyORM": 0.2150000040233135, "eval_runtime": 262.2909, "eval_samples_per_second": 0.191, "eval_steps_per_second": 0.019, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 46.133334922790525, "epoch": 0.10303030303030303, "grad_norm": 4.130315780639648, "kl": 0.018082523345947267, "learning_rate": 2e-07, "loss": 0.024475347995758057, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166666977107526, "reward_std": 0.2766233593225479, "rewards/MultiModalAccuracyORM": 0.24166666977107526, "step": 255, "train_speed(iter/s)": 0.041648 }, { "clip_ratio": 0.0, "completion_length": 6.6, "epoch": 0.10505050505050505, "grad_norm": 10.52556324005127, "kl": 0.020127105712890624, "learning_rate": 2e-07, "loss": -0.008974193781614303, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000037252903, "reward_std": 0.2567190647125244, "rewards/MultiModalAccuracyORM": 0.2750000037252903, "step": 260, "train_speed(iter/s)": 0.041738 }, { "clip_ratio": 0.0, "completion_length": 6.45, "epoch": 0.10707070707070707, "grad_norm": 11.179485321044922, "kl": 0.03880462646484375, "learning_rate": 2e-07, "loss": 0.0015405803918838502, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1666666716337204, "reward_std": 0.2918527454137802, "rewards/MultiModalAccuracyORM": 0.1666666716337204, "step": 265, "train_speed(iter/s)": 0.041756 }, { "clip_ratio": 0.0, "completion_length": 18.75, "epoch": 0.10909090909090909, "grad_norm": 4.639992713928223, "kl": 0.018306541442871093, "learning_rate": 2e-07, "loss": -0.012826296687126159, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10833333656191826, "reward_std": 0.174764084815979, "rewards/MultiModalAccuracyORM": 0.10833333656191826, "step": 270, "train_speed(iter/s)": 0.041759 }, { "clip_ratio": 0.0, "completion_length": 117.6, "epoch": 0.1111111111111111, "grad_norm": 14.52376651763916, "kl": 0.02277069091796875, "learning_rate": 2e-07, "loss": -0.03760814070701599, "memory(GiB)": 104.49, "response_clip_ratio": 0.05, "reward": 0.30833334028720855, "reward_std": 0.3679845929145813, "rewards/MultiModalAccuracyORM": 0.30833334028720855, "step": 275, "train_speed(iter/s)": 0.041762 }, { "clip_ratio": 0.0, "completion_length": 41.5, "epoch": 0.11313131313131314, "grad_norm": 7.044532775878906, "kl": 0.04247570037841797, "learning_rate": 2e-07, "loss": 0.05246252417564392, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000670552253, "reward_std": 0.30385262966156007, "rewards/MultiModalAccuracyORM": 0.22500000670552253, "step": 280, "train_speed(iter/s)": 0.041745 }, { "clip_ratio": 0.0, "completion_length": 23.9, "epoch": 0.11515151515151516, "grad_norm": 3.5612969398498535, "kl": 0.04666891098022461, "learning_rate": 2e-07, "loss": -0.03580006957054138, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.14166667312383652, "reward_std": 0.20594746768474578, "rewards/MultiModalAccuracyORM": 0.14166667312383652, "step": 285, "train_speed(iter/s)": 0.041805 }, { "clip_ratio": 0.0, "completion_length": 57.5, "epoch": 0.11717171717171718, "grad_norm": 22.66056251525879, "kl": 0.0072917938232421875, "learning_rate": 2e-07, "loss": 0.030799278616905214, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667386889457, "reward_std": 0.3523798406124115, "rewards/MultiModalAccuracyORM": 0.24166667386889457, "step": 290, "train_speed(iter/s)": 0.041794 }, { "clip_ratio": 0.0, "completion_length": 64.1, "epoch": 0.1191919191919192, "grad_norm": 16.353897094726562, "kl": 0.02278270721435547, "learning_rate": 2e-07, "loss": 0.0040659308433532715, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.07500000149011612, "reward_std": 0.16200153529644012, "rewards/MultiModalAccuracyORM": 0.07500000149011612, "step": 295, "train_speed(iter/s)": 0.041713 }, { "clip_ratio": 0.0, "completion_length": 26.0, "epoch": 0.12121212121212122, "grad_norm": 3.0584208965301514, "kl": 0.021613693237304686, "learning_rate": 2e-07, "loss": 0.015577539801597595, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000447034837, "reward_std": 0.2175043046474457, "rewards/MultiModalAccuracyORM": 0.12500000447034837, "step": 300, "train_speed(iter/s)": 0.041708 }, { "clip_ratio": 0.0, "completion_length": 8.4, "epoch": 0.12323232323232323, "grad_norm": 2.683347225189209, "kl": 0.05754499435424805, "learning_rate": 2e-07, "loss": 0.0014399250969290734, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.13333333805203437, "reward_std": 0.24637182354927062, "rewards/MultiModalAccuracyORM": 0.13333333805203437, "step": 305, "train_speed(iter/s)": 0.041731 }, { "clip_ratio": 0.0, "completion_length": 11.0, "epoch": 0.12525252525252525, "grad_norm": 4.011137008666992, "kl": 0.003471851348876953, "learning_rate": 2e-07, "loss": -0.012657842040061951, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000223517418, "reward_std": 0.17781037986278533, "rewards/MultiModalAccuracyORM": 0.12500000223517418, "step": 310, "train_speed(iter/s)": 0.041745 }, { "clip_ratio": 0.0, "completion_length": 14.4, "epoch": 0.12727272727272726, "grad_norm": 2.4296364784240723, "kl": 0.01938905715942383, "learning_rate": 2e-07, "loss": 0.023499640822410583, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000149011613, "reward_std": 0.1808116167783737, "rewards/MultiModalAccuracyORM": 0.17500000149011613, "step": 315, "train_speed(iter/s)": 0.041811 }, { "clip_ratio": 0.0, "completion_length": 9.35, "epoch": 0.1292929292929293, "grad_norm": 1.5319490432739258, "kl": 0.023272895812988283, "learning_rate": 2e-07, "loss": -0.0005661348812282085, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.23860624432563782, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 320, "train_speed(iter/s)": 0.041846 }, { "clip_ratio": 0.0, "completion_length": 14.8, "epoch": 0.13131313131313133, "grad_norm": 28.09259605407715, "kl": 0.055776214599609374, "learning_rate": 2e-07, "loss": -0.00978400707244873, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15833333507180214, "reward_std": 0.2785158395767212, "rewards/MultiModalAccuracyORM": 0.15833333507180214, "step": 325, "train_speed(iter/s)": 0.041894 }, { "clip_ratio": 0.0, "completion_length": 9.8, "epoch": 0.13333333333333333, "grad_norm": 5.655847072601318, "kl": 0.01194305419921875, "learning_rate": 2e-07, "loss": -0.023021923005580903, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2250000037252903, "reward_std": 0.242361918091774, "rewards/MultiModalAccuracyORM": 0.2250000037252903, "step": 330, "train_speed(iter/s)": 0.041922 }, { "clip_ratio": 0.0, "completion_length": 10.0, "epoch": 0.13535353535353536, "grad_norm": 16.269479751586914, "kl": 0.012023067474365235, "learning_rate": 2e-07, "loss": 0.009542696177959442, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.4074155628681183, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 335, "train_speed(iter/s)": 0.041926 }, { "clip_ratio": 0.0, "completion_length": 9.1, "epoch": 0.13737373737373737, "grad_norm": 19.7489013671875, "kl": 0.041985511779785156, "learning_rate": 2e-07, "loss": -0.009631294012069701, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000111758709, "reward_std": 0.38227055966854095, "rewards/MultiModalAccuracyORM": 0.3250000111758709, "step": 340, "train_speed(iter/s)": 0.042003 }, { "clip_ratio": 0.0, "completion_length": 17.25, "epoch": 0.1393939393939394, "grad_norm": 25.704818725585938, "kl": 0.02933082580566406, "learning_rate": 2e-07, "loss": 0.005663518235087395, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000596046447, "reward_std": 0.287842845916748, "rewards/MultiModalAccuracyORM": 0.17500000596046447, "step": 345, "train_speed(iter/s)": 0.042012 }, { "clip_ratio": 0.0, "completion_length": 25.0, "epoch": 0.1414141414141414, "grad_norm": 30.1114559173584, "kl": 0.010479164123535157, "learning_rate": 2e-07, "loss": 0.018732863664627075, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000447034837, "reward_std": 0.2077010989189148, "rewards/MultiModalAccuracyORM": 0.12500000447034837, "step": 350, "train_speed(iter/s)": 0.041986 }, { "clip_ratio": 0.0, "completion_length": 18.65, "epoch": 0.14343434343434344, "grad_norm": 4.131731033325195, "kl": 0.03218498229980469, "learning_rate": 2e-07, "loss": 0.05048830509185791, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1833333380520344, "reward_std": 0.22854881286621093, "rewards/MultiModalAccuracyORM": 0.1833333380520344, "step": 355, "train_speed(iter/s)": 0.041992 }, { "clip_ratio": 0.0, "completion_length": 5.9, "epoch": 0.14545454545454545, "grad_norm": 2.5443966388702393, "kl": 0.028252887725830077, "learning_rate": 2e-07, "loss": 0.011212460696697235, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666813194752, "reward_std": 0.3104326128959656, "rewards/MultiModalAccuracyORM": 0.3166666813194752, "step": 360, "train_speed(iter/s)": 0.042049 }, { "clip_ratio": 0.0, "completion_length": 52.05, "epoch": 0.14747474747474748, "grad_norm": 4.374809265136719, "kl": 0.024268913269042968, "learning_rate": 2e-07, "loss": -0.0001811852096579969, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000447034836, "reward_std": 0.20544483065605162, "rewards/MultiModalAccuracyORM": 0.15000000447034836, "step": 365, "train_speed(iter/s)": 0.042035 }, { "clip_ratio": 0.0, "completion_length": 9.75, "epoch": 0.1494949494949495, "grad_norm": 16.779956817626953, "kl": 0.015867042541503906, "learning_rate": 2e-07, "loss": 0.022855284810066222, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000447034836, "reward_std": 0.17529989182949066, "rewards/MultiModalAccuracyORM": 0.17500000447034836, "step": 370, "train_speed(iter/s)": 0.042043 }, { "clip_ratio": 0.0, "completion_length": 32.05, "epoch": 0.15151515151515152, "grad_norm": 1.799055576324463, "kl": 0.02576103210449219, "learning_rate": 2e-07, "loss": 0.03886341452598572, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000670552255, "reward_std": 0.23481498062610626, "rewards/MultiModalAccuracyORM": 0.15000000670552255, "step": 375, "train_speed(iter/s)": 0.041993 }, { "clip_ratio": 0.0, "completion_length": 11.3, "epoch": 0.15353535353535352, "grad_norm": 14.809004783630371, "kl": 0.06607561111450196, "learning_rate": 2e-07, "loss": 0.02258915901184082, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.27759079039096834, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 380, "train_speed(iter/s)": 0.042034 }, { "clip_ratio": 0.0, "completion_length": 11.6, "epoch": 0.15555555555555556, "grad_norm": 4.855790138244629, "kl": 0.044758033752441403, "learning_rate": 2e-07, "loss": 0.006666116416454315, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1916666731238365, "reward_std": 0.2877832442522049, "rewards/MultiModalAccuracyORM": 0.1916666731238365, "step": 385, "train_speed(iter/s)": 0.042053 }, { "clip_ratio": 0.0, "completion_length": 5.45, "epoch": 0.15757575757575756, "grad_norm": 3.650961399078369, "kl": 0.09126663208007812, "learning_rate": 2e-07, "loss": -0.006338779628276825, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000014901161, "reward_std": 0.22384165227413177, "rewards/MultiModalAccuracyORM": 0.2750000014901161, "step": 390, "train_speed(iter/s)": 0.04209 }, { "clip_ratio": 0.0, "completion_length": 21.7, "epoch": 0.1595959595959596, "grad_norm": 22.398860931396484, "kl": 0.05564393997192383, "learning_rate": 2e-07, "loss": 0.011527793109416961, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.3385071337223053, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 395, "train_speed(iter/s)": 0.04213 }, { "clip_ratio": 0.0, "completion_length": 32.45, "epoch": 0.16161616161616163, "grad_norm": 3.777151346206665, "kl": 0.08077354431152343, "learning_rate": 2e-07, "loss": 0.02410067617893219, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.2687189429998398, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 400, "train_speed(iter/s)": 0.04213 }, { "clip_ratio": 0.0, "completion_length": 5.7, "epoch": 0.16363636363636364, "grad_norm": 6.114872455596924, "kl": 0.09431419372558594, "learning_rate": 2e-07, "loss": 0.02062232345342636, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2833333410322666, "reward_std": 0.384308198094368, "rewards/MultiModalAccuracyORM": 0.2833333410322666, "step": 405, "train_speed(iter/s)": 0.042217 }, { "clip_ratio": 0.0, "completion_length": 11.1, "epoch": 0.16565656565656567, "grad_norm": 2.8733115196228027, "kl": 0.07746734619140624, "learning_rate": 2e-07, "loss": 0.014683787524700165, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3500000074505806, "reward_std": 0.28160068988800047, "rewards/MultiModalAccuracyORM": 0.3500000074505806, "step": 410, "train_speed(iter/s)": 0.042237 }, { "clip_ratio": 0.0, "completion_length": 29.55, "epoch": 0.16767676767676767, "grad_norm": 1.103491187095642, "kl": 0.013630294799804687, "learning_rate": 2e-07, "loss": 0.031570857763290404, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.3222051203250885, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 415, "train_speed(iter/s)": 0.042253 }, { "clip_ratio": 0.0, "completion_length": 11.65, "epoch": 0.1696969696969697, "grad_norm": 19.609107971191406, "kl": 0.006585693359375, "learning_rate": 2e-07, "loss": 0.029933744668960573, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166666865348817, "reward_std": 0.2815766781568527, "rewards/MultiModalAccuracyORM": 0.29166666865348817, "step": 420, "train_speed(iter/s)": 0.042267 }, { "clip_ratio": 0.0, "completion_length": 33.85, "epoch": 0.1717171717171717, "grad_norm": 3.5567312240600586, "kl": 0.027184486389160156, "learning_rate": 2e-07, "loss": -0.008297288417816162, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000067055225, "reward_std": 0.3423224091529846, "rewards/MultiModalAccuracyORM": 0.3000000067055225, "step": 425, "train_speed(iter/s)": 0.042268 }, { "clip_ratio": 0.0, "completion_length": 40.6, "epoch": 0.17373737373737375, "grad_norm": 4.005617141723633, "kl": 0.037563323974609375, "learning_rate": 2e-07, "loss": -0.008759691566228866, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.2403598755598068, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 430, "train_speed(iter/s)": 0.042273 }, { "clip_ratio": 0.0, "completion_length": 29.65, "epoch": 0.17575757575757575, "grad_norm": 1.1876083612442017, "kl": 0.04276580810546875, "learning_rate": 2e-07, "loss": 0.009293363988399505, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.25639069378376006, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 435, "train_speed(iter/s)": 0.042306 }, { "clip_ratio": 0.0, "completion_length": 24.1, "epoch": 0.17777777777777778, "grad_norm": 1.259384274482727, "kl": 0.09014434814453125, "learning_rate": 2e-07, "loss": 0.07308403849601745, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333373069763, "reward_std": 0.2925831705331802, "rewards/MultiModalAccuracyORM": 0.2083333373069763, "step": 440, "train_speed(iter/s)": 0.042352 }, { "clip_ratio": 0.0, "completion_length": 7.1, "epoch": 0.1797979797979798, "grad_norm": 1.2361171245574951, "kl": 0.0314971923828125, "learning_rate": 2e-07, "loss": -0.04375269114971161, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30833333656191825, "reward_std": 0.29863070249557494, "rewards/MultiModalAccuracyORM": 0.30833333656191825, "step": 445, "train_speed(iter/s)": 0.042392 }, { "clip_ratio": 0.0, "completion_length": 6.65, "epoch": 0.18181818181818182, "grad_norm": 2.4363491535186768, "kl": 0.07178993225097656, "learning_rate": 2e-07, "loss": 0.0028454601764678956, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000596046447, "reward_std": 0.41791602969169617, "rewards/MultiModalAccuracyORM": 0.32500000596046447, "step": 450, "train_speed(iter/s)": 0.042441 }, { "clip_ratio": 0.0, "completion_length": 6.6, "epoch": 0.18383838383838383, "grad_norm": 12.971217155456543, "kl": 0.05601959228515625, "learning_rate": 2e-07, "loss": 0.012572245299816131, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.22704698145389557, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 455, "train_speed(iter/s)": 0.042477 }, { "clip_ratio": 0.0, "completion_length": 32.85, "epoch": 0.18585858585858586, "grad_norm": 11.262785911560059, "kl": 0.014653778076171875, "learning_rate": 2e-07, "loss": 0.005643188953399658, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000022351742, "reward_std": 0.26040059328079224, "rewards/MultiModalAccuracyORM": 0.2750000022351742, "step": 460, "train_speed(iter/s)": 0.042456 }, { "clip_ratio": 0.0, "completion_length": 17.2, "epoch": 0.18787878787878787, "grad_norm": 9.14407730102539, "kl": 0.03995361328125, "learning_rate": 2e-07, "loss": 0.0012056897394359112, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000521540642, "reward_std": 0.3923635810613632, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 465, "train_speed(iter/s)": 0.042452 }, { "clip_ratio": 0.0, "completion_length": 5.2, "epoch": 0.1898989898989899, "grad_norm": 2.3540585041046143, "kl": 0.041180419921875, "learning_rate": 2e-07, "loss": 0.018683533370494842, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10000000149011612, "reward_std": 0.20722824335098267, "rewards/MultiModalAccuracyORM": 0.10000000149011612, "step": 470, "train_speed(iter/s)": 0.042503 }, { "clip_ratio": 0.0, "completion_length": 27.25, "epoch": 0.1919191919191919, "grad_norm": 6.397303581237793, "kl": 0.02938995361328125, "learning_rate": 2e-07, "loss": 0.005294787883758545, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.308333333581686, "reward_std": 0.31422091126441953, "rewards/MultiModalAccuracyORM": 0.308333333581686, "step": 475, "train_speed(iter/s)": 0.042517 }, { "clip_ratio": 0.0, "completion_length": 11.45, "epoch": 0.19393939393939394, "grad_norm": 15.569790840148926, "kl": 0.07780342102050782, "learning_rate": 2e-07, "loss": 0.012630045413970947, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.36667739152908324, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 480, "train_speed(iter/s)": 0.042512 }, { "clip_ratio": 0.0, "completion_length": 9.95, "epoch": 0.19595959595959597, "grad_norm": 12.205713272094727, "kl": 0.02214508056640625, "learning_rate": 2e-07, "loss": 0.012730973958969116, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333656191824, "reward_std": 0.25566026866436004, "rewards/MultiModalAccuracyORM": 0.35833333656191824, "step": 485, "train_speed(iter/s)": 0.042552 }, { "clip_ratio": 0.0, "completion_length": 16.55, "epoch": 0.19797979797979798, "grad_norm": 0.97981858253479, "kl": 0.05444526672363281, "learning_rate": 2e-07, "loss": 0.006719142198562622, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10000000074505806, "reward_std": 0.203472563624382, "rewards/MultiModalAccuracyORM": 0.10000000074505806, "step": 490, "train_speed(iter/s)": 0.04257 }, { "clip_ratio": 0.0, "completion_length": 31.85, "epoch": 0.2, "grad_norm": 2.1149213314056396, "kl": 0.06137847900390625, "learning_rate": 2e-07, "loss": 0.04113571047782898, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.38333334401249886, "reward_std": 0.3259632259607315, "rewards/MultiModalAccuracyORM": 0.38333334401249886, "step": 495, "train_speed(iter/s)": 0.042559 }, { "epoch": 0.20202020202020202, "grad_norm": 18.28374671936035, "learning_rate": 2e-07, "loss": 0.0038329623639583588, "memory(GiB)": 104.49, "step": 500, "train_speed(iter/s)": 0.042571 }, { "epoch": 0.20202020202020202, "eval_clip_ratio": 0.0, "eval_completion_length": 26.648334164619445, "eval_kl": 0.08782589912414551, "eval_loss": 7.593631835334236e-06, "eval_response_clip_ratio": 0.001666666716337204, "eval_reward": 0.2816666740179062, "eval_reward_std": 0.3331107318401337, "eval_rewards/MultiModalAccuracyORM": 0.2816666740179062, "eval_runtime": 274.2098, "eval_samples_per_second": 0.182, "eval_steps_per_second": 0.018, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 11.25, "epoch": 0.20404040404040405, "grad_norm": 6.910037517547607, "kl": 0.07545309066772461, "learning_rate": 2e-07, "loss": 0.02395549863576889, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30416667498648164, "reward_std": 0.2502841353416443, "rewards/MultiModalAccuracyORM": 0.30416667498648164, "step": 505, "train_speed(iter/s)": 0.041389 }, { "clip_ratio": 0.0, "completion_length": 5.3, "epoch": 0.20606060606060606, "grad_norm": 7.303215503692627, "kl": 0.03816680908203125, "learning_rate": 2e-07, "loss": 0.012394474446773529, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.20363159775733947, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 510, "train_speed(iter/s)": 0.041415 }, { "clip_ratio": 0.0, "completion_length": 37.0, "epoch": 0.2080808080808081, "grad_norm": 2.0224409103393555, "kl": 0.038478851318359375, "learning_rate": 2e-07, "loss": -0.017507487535476686, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333387970924, "reward_std": 0.37155145704746245, "rewards/MultiModalAccuracyORM": 0.2583333387970924, "step": 515, "train_speed(iter/s)": 0.041435 }, { "clip_ratio": 0.0, "completion_length": 22.9, "epoch": 0.2101010101010101, "grad_norm": 9.651928901672363, "kl": 0.00984039306640625, "learning_rate": 2e-07, "loss": -0.002422221563756466, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3500000014901161, "reward_std": 0.30187162160873415, "rewards/MultiModalAccuracyORM": 0.3500000014901161, "step": 520, "train_speed(iter/s)": 0.041478 }, { "clip_ratio": 0.0, "completion_length": 9.75, "epoch": 0.21212121212121213, "grad_norm": 5.6520562171936035, "kl": 0.031005859375, "learning_rate": 2e-07, "loss": 0.00025533935986459254, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.2338038921356201, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 525, "train_speed(iter/s)": 0.041505 }, { "clip_ratio": 0.0, "completion_length": 6.5, "epoch": 0.21414141414141413, "grad_norm": 20.748729705810547, "kl": 0.0915985107421875, "learning_rate": 2e-07, "loss": -0.01767445057630539, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000096857548, "reward_std": 0.38835368156433103, "rewards/MultiModalAccuracyORM": 0.3000000096857548, "step": 530, "train_speed(iter/s)": 0.041524 }, { "clip_ratio": 0.0, "completion_length": 11.45, "epoch": 0.21616161616161617, "grad_norm": 0.023180894553661346, "kl": 0.07088775634765625, "learning_rate": 2e-07, "loss": 0.029787826538085937, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000521540644, "reward_std": 0.2526735752820969, "rewards/MultiModalAccuracyORM": 0.37500000521540644, "step": 535, "train_speed(iter/s)": 0.041552 }, { "clip_ratio": 0.0, "completion_length": 9.2, "epoch": 0.21818181818181817, "grad_norm": 16.621583938598633, "kl": 0.05093994140625, "learning_rate": 2e-07, "loss": -0.009274721145629883, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166667312383654, "reward_std": 0.28561058938503264, "rewards/MultiModalAccuracyORM": 0.29166667312383654, "step": 540, "train_speed(iter/s)": 0.041581 }, { "clip_ratio": 0.0, "completion_length": 8.45, "epoch": 0.2202020202020202, "grad_norm": 17.103206634521484, "kl": 0.0737823486328125, "learning_rate": 2e-07, "loss": 0.021037888526916505, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3583333432674408, "reward_std": 0.28561058938503264, "rewards/MultiModalAccuracyORM": 0.3583333432674408, "step": 545, "train_speed(iter/s)": 0.041645 }, { "clip_ratio": 0.0, "completion_length": 28.85, "epoch": 0.2222222222222222, "grad_norm": 1.5227787494659424, "kl": 0.07874641418457032, "learning_rate": 2e-07, "loss": 0.00487855076789856, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000223517417, "reward_std": 0.21779412031173706, "rewards/MultiModalAccuracyORM": 0.17500000223517417, "step": 550, "train_speed(iter/s)": 0.041506 }, { "clip_ratio": 0.0, "completion_length": 17.6, "epoch": 0.22424242424242424, "grad_norm": 13.277663230895996, "kl": 0.039247894287109376, "learning_rate": 2e-07, "loss": 0.008411864936351775, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.13333333656191826, "reward_std": 0.29784068167209626, "rewards/MultiModalAccuracyORM": 0.13333333656191826, "step": 555, "train_speed(iter/s)": 0.041504 }, { "clip_ratio": 0.0, "completion_length": 34.2, "epoch": 0.22626262626262628, "grad_norm": 0.10883937031030655, "kl": 0.06273307800292968, "learning_rate": 2e-07, "loss": 0.012170317023992539, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666679084301, "reward_std": 0.12333081662654877, "rewards/MultiModalAccuracyORM": 0.2666666679084301, "step": 560, "train_speed(iter/s)": 0.041519 }, { "clip_ratio": 0.0, "completion_length": 12.5, "epoch": 0.22828282828282828, "grad_norm": 12.209307670593262, "kl": 0.04704780578613281, "learning_rate": 2e-07, "loss": 0.032337296009063723, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000074505806, "reward_std": 0.2626924514770508, "rewards/MultiModalAccuracyORM": 0.17500000074505806, "step": 565, "train_speed(iter/s)": 0.04155 }, { "clip_ratio": 0.0, "completion_length": 7.85, "epoch": 0.23030303030303031, "grad_norm": 4.45810079574585, "kl": 0.05213623046875, "learning_rate": 2e-07, "loss": 0.001686885952949524, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.25891573131084444, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 570, "train_speed(iter/s)": 0.041578 }, { "clip_ratio": 0.0, "completion_length": 65.55, "epoch": 0.23232323232323232, "grad_norm": 0.6504287719726562, "kl": 0.08351707458496094, "learning_rate": 2e-07, "loss": 0.016631042957305907, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1500000014901161, "reward_std": 0.26906835436820986, "rewards/MultiModalAccuracyORM": 0.1500000014901161, "step": 575, "train_speed(iter/s)": 0.041538 }, { "clip_ratio": 0.0, "completion_length": 9.85, "epoch": 0.23434343434343435, "grad_norm": 27.585575103759766, "kl": 0.1207763671875, "learning_rate": 2e-07, "loss": -0.036790531873703, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000067055225, "reward_std": 0.3860618233680725, "rewards/MultiModalAccuracyORM": 0.3000000067055225, "step": 580, "train_speed(iter/s)": 0.041563 }, { "clip_ratio": 0.0, "completion_length": 10.3, "epoch": 0.23636363636363636, "grad_norm": 10.094830513000488, "kl": 0.04735574722290039, "learning_rate": 2e-07, "loss": 0.008206900209188461, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.19166667461395265, "reward_std": 0.23631438612937927, "rewards/MultiModalAccuracyORM": 0.19166667461395265, "step": 585, "train_speed(iter/s)": 0.041593 }, { "clip_ratio": 0.0, "completion_length": 127.6, "epoch": 0.2383838383838384, "grad_norm": 3.5195720195770264, "kl": 0.03963155746459961, "learning_rate": 2e-07, "loss": 0.027892309427261352, "memory(GiB)": 104.49, "response_clip_ratio": 0.05, "reward": 0.22500000521540642, "reward_std": 0.22224706113338472, "rewards/MultiModalAccuracyORM": 0.22500000521540642, "step": 590, "train_speed(iter/s)": 0.041543 }, { "clip_ratio": 0.0, "completion_length": 8.75, "epoch": 0.2404040404040404, "grad_norm": 12.612972259521484, "kl": 0.0610992431640625, "learning_rate": 2e-07, "loss": -0.022297632694244385, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000089406967, "reward_std": 0.33376438319683077, "rewards/MultiModalAccuracyORM": 0.4000000089406967, "step": 595, "train_speed(iter/s)": 0.041563 }, { "clip_ratio": 0.0, "completion_length": 21.75, "epoch": 0.24242424242424243, "grad_norm": 1.1488845348358154, "kl": 0.06821136474609375, "learning_rate": 2e-07, "loss": 0.03176195621490478, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000074505805, "reward_std": 0.3149157464504242, "rewards/MultiModalAccuracyORM": 0.22500000074505805, "step": 600, "train_speed(iter/s)": 0.041549 }, { "clip_ratio": 0.0, "completion_length": 8.5, "epoch": 0.24444444444444444, "grad_norm": 4.132078170776367, "kl": 0.07441596984863282, "learning_rate": 2e-07, "loss": 0.004773074015974999, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.13333333879709244, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.13333333879709244, "step": 605, "train_speed(iter/s)": 0.041583 }, { "clip_ratio": 0.0, "completion_length": 16.85, "epoch": 0.24646464646464647, "grad_norm": 3.0928878784179688, "kl": 0.050506591796875, "learning_rate": 2e-07, "loss": 0.0011304418556392192, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500000670552255, "reward_std": 0.31520852744579314, "rewards/MultiModalAccuracyORM": 0.27500000670552255, "step": 610, "train_speed(iter/s)": 0.041586 }, { "clip_ratio": 0.0, "completion_length": 8.45, "epoch": 0.24848484848484848, "grad_norm": 13.133064270019531, "kl": 0.05210723876953125, "learning_rate": 2e-07, "loss": -0.009364684671163559, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833334028720857, "reward_std": 0.28003925681114195, "rewards/MultiModalAccuracyORM": 0.20833334028720857, "step": 615, "train_speed(iter/s)": 0.041615 }, { "clip_ratio": 0.0, "completion_length": 14.05, "epoch": 0.2505050505050505, "grad_norm": 21.168598175048828, "kl": 0.06778411865234375, "learning_rate": 2e-07, "loss": -0.006833799928426742, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000096857548, "reward_std": 0.330559054017067, "rewards/MultiModalAccuracyORM": 0.3000000096857548, "step": 620, "train_speed(iter/s)": 0.041639 }, { "clip_ratio": 0.0, "completion_length": 7.35, "epoch": 0.25252525252525254, "grad_norm": 16.575620651245117, "kl": 0.05116090774536133, "learning_rate": 2e-07, "loss": -0.016651205718517303, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2250000074505806, "reward_std": 0.28859728276729585, "rewards/MultiModalAccuracyORM": 0.2250000074505806, "step": 625, "train_speed(iter/s)": 0.041672 }, { "clip_ratio": 0.0, "completion_length": 9.7, "epoch": 0.2545454545454545, "grad_norm": 3.503321886062622, "kl": 0.0628082275390625, "learning_rate": 2e-07, "loss": -0.008116110414266586, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.2330589234828949, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 630, "train_speed(iter/s)": 0.041685 }, { "clip_ratio": 0.0, "completion_length": 6.0, "epoch": 0.25656565656565655, "grad_norm": 15.203675270080566, "kl": 0.06846466064453124, "learning_rate": 2e-07, "loss": 0.011408740282058715, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.38333334028720856, "reward_std": 0.3008869707584381, "rewards/MultiModalAccuracyORM": 0.38333334028720856, "step": 635, "train_speed(iter/s)": 0.04173 }, { "clip_ratio": 0.0, "completion_length": 15.05, "epoch": 0.2585858585858586, "grad_norm": 32.77607727050781, "kl": 0.12814788818359374, "learning_rate": 2e-07, "loss": -0.0371063232421875, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000081956387, "reward_std": 0.36673698723316195, "rewards/MultiModalAccuracyORM": 0.3250000081956387, "step": 640, "train_speed(iter/s)": 0.041758 }, { "clip_ratio": 0.0, "completion_length": 19.0, "epoch": 0.2606060606060606, "grad_norm": 15.344500541687012, "kl": 0.105792236328125, "learning_rate": 2e-07, "loss": -0.006553761661052704, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333507180213, "reward_std": 0.2597057640552521, "rewards/MultiModalAccuracyORM": 0.20833333507180213, "step": 645, "train_speed(iter/s)": 0.041814 }, { "clip_ratio": 0.0, "completion_length": 46.8, "epoch": 0.26262626262626265, "grad_norm": 16.03054428100586, "kl": 0.04459686279296875, "learning_rate": 2e-07, "loss": 0.036105594038963316, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.2403598755598068, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 650, "train_speed(iter/s)": 0.041793 }, { "clip_ratio": 0.0, "completion_length": 16.05, "epoch": 0.26464646464646463, "grad_norm": 17.309656143188477, "kl": 0.11004905700683594, "learning_rate": 2e-07, "loss": 0.017519061267375947, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.3682032287120819, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 655, "train_speed(iter/s)": 0.04181 }, { "clip_ratio": 0.0, "completion_length": 13.3, "epoch": 0.26666666666666666, "grad_norm": 4.0642170906066895, "kl": 0.054970169067382814, "learning_rate": 2e-07, "loss": -0.008081305027008056, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333333879709244, "reward_std": 0.23230449259281158, "rewards/MultiModalAccuracyORM": 0.28333333879709244, "step": 660, "train_speed(iter/s)": 0.041838 }, { "clip_ratio": 0.0, "completion_length": 45.25, "epoch": 0.2686868686868687, "grad_norm": 7.022747993469238, "kl": 0.10093574523925782, "learning_rate": 2e-07, "loss": 0.027714025974273682, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.14166666865348815, "reward_std": 0.2531497746706009, "rewards/MultiModalAccuracyORM": 0.14166666865348815, "step": 665, "train_speed(iter/s)": 0.041812 }, { "clip_ratio": 0.0, "completion_length": 33.75, "epoch": 0.27070707070707073, "grad_norm": 9.984959602355957, "kl": 0.023084259033203124, "learning_rate": 2e-07, "loss": 0.026220232248306274, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10000000149011612, "reward_std": 0.20661829113960267, "rewards/MultiModalAccuracyORM": 0.10000000149011612, "step": 670, "train_speed(iter/s)": 0.041806 }, { "clip_ratio": 0.0, "completion_length": 38.05, "epoch": 0.2727272727272727, "grad_norm": 7.702730178833008, "kl": 0.16024627685546874, "learning_rate": 2e-07, "loss": -0.049201831221580505, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000670552254, "reward_std": 0.20817729830741882, "rewards/MultiModalAccuracyORM": 0.32500000670552254, "step": 675, "train_speed(iter/s)": 0.041821 }, { "clip_ratio": 0.0, "completion_length": 33.9, "epoch": 0.27474747474747474, "grad_norm": 0.16480083763599396, "kl": 0.03549041748046875, "learning_rate": 2e-07, "loss": 0.006150122731924057, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333432674409, "reward_std": 0.2323400765657425, "rewards/MultiModalAccuracyORM": 0.20833333432674409, "step": 680, "train_speed(iter/s)": 0.041846 }, { "clip_ratio": 0.0, "completion_length": 10.8, "epoch": 0.2767676767676768, "grad_norm": 0.027387158945202827, "kl": 0.10235595703125, "learning_rate": 2e-07, "loss": 0.02902156114578247, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15833333507180214, "reward_std": 0.13583914041519166, "rewards/MultiModalAccuracyORM": 0.15833333507180214, "step": 685, "train_speed(iter/s)": 0.041856 }, { "clip_ratio": 0.0, "completion_length": 13.5, "epoch": 0.2787878787878788, "grad_norm": 6.602695465087891, "kl": 0.0608123779296875, "learning_rate": 2e-07, "loss": 0.012946502864360809, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667312383653, "reward_std": 0.33300994634628295, "rewards/MultiModalAccuracyORM": 0.21666667312383653, "step": 690, "train_speed(iter/s)": 0.04186 }, { "clip_ratio": 0.0, "completion_length": 5.4, "epoch": 0.2808080808080808, "grad_norm": 3.4819886684417725, "kl": 0.12022647857666016, "learning_rate": 2e-07, "loss": 0.02661624550819397, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000000298023224, "reward_std": 0.1981794685125351, "rewards/MultiModalAccuracyORM": 0.20000000298023224, "step": 695, "train_speed(iter/s)": 0.041875 }, { "clip_ratio": 0.0, "completion_length": 8.45, "epoch": 0.2828282828282828, "grad_norm": 9.789923667907715, "kl": 0.06219940185546875, "learning_rate": 2e-07, "loss": -0.0169070765376091, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.31451369225978854, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 700, "train_speed(iter/s)": 0.041904 }, { "clip_ratio": 0.0, "completion_length": 26.7, "epoch": 0.28484848484848485, "grad_norm": 4.8883514404296875, "kl": 0.0865386962890625, "learning_rate": 2e-07, "loss": -0.01697884649038315, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667312383654, "reward_std": 0.37380772531032563, "rewards/MultiModalAccuracyORM": 0.41666667312383654, "step": 705, "train_speed(iter/s)": 0.041918 }, { "clip_ratio": 0.0, "completion_length": 6.65, "epoch": 0.2868686868686869, "grad_norm": 0.24715355038642883, "kl": 0.1329193115234375, "learning_rate": 2e-07, "loss": 0.030154657363891602, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666746139526, "reward_std": 0.1888910174369812, "rewards/MultiModalAccuracyORM": 0.2916666746139526, "step": 710, "train_speed(iter/s)": 0.041945 }, { "clip_ratio": 0.0, "completion_length": 15.05, "epoch": 0.28888888888888886, "grad_norm": 20.6412296295166, "kl": 0.0775299072265625, "learning_rate": 2e-07, "loss": 0.010814273357391357, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3666666731238365, "reward_std": 0.23236408829689026, "rewards/MultiModalAccuracyORM": 0.3666666731238365, "step": 715, "train_speed(iter/s)": 0.041844 }, { "clip_ratio": 0.0, "completion_length": 67.7, "epoch": 0.2909090909090909, "grad_norm": 19.74690055847168, "kl": 0.0287322998046875, "learning_rate": 2e-07, "loss": 0.011786083877086639, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666753590107, "reward_std": 0.3597048044204712, "rewards/MultiModalAccuracyORM": 0.3916666753590107, "step": 720, "train_speed(iter/s)": 0.041856 }, { "clip_ratio": 0.0, "completion_length": 7.95, "epoch": 0.29292929292929293, "grad_norm": 12.01062297821045, "kl": 0.0283416748046875, "learning_rate": 2e-07, "loss": 0.030677640438079835, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333447575569, "reward_std": 0.37494559586048126, "rewards/MultiModalAccuracyORM": 0.3083333447575569, "step": 725, "train_speed(iter/s)": 0.041837 }, { "clip_ratio": 0.0, "completion_length": 7.1, "epoch": 0.29494949494949496, "grad_norm": 18.26583480834961, "kl": 0.048813819885253906, "learning_rate": 2e-07, "loss": 0.00018847386818379163, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000298023225, "reward_std": 0.2855865776538849, "rewards/MultiModalAccuracyORM": 0.15000000298023225, "step": 730, "train_speed(iter/s)": 0.041864 }, { "clip_ratio": 0.0, "completion_length": 8.2, "epoch": 0.296969696969697, "grad_norm": 23.585920333862305, "kl": 0.10856704711914063, "learning_rate": 2e-07, "loss": -0.010623668134212495, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.2486636757850647, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 735, "train_speed(iter/s)": 0.041867 }, { "clip_ratio": 0.0, "completion_length": 36.85, "epoch": 0.298989898989899, "grad_norm": 13.779229164123535, "kl": 0.16164474487304686, "learning_rate": 2e-07, "loss": 0.09003554582595825, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3500000096857548, "reward_std": 0.3144781023263931, "rewards/MultiModalAccuracyORM": 0.3500000096857548, "step": 740, "train_speed(iter/s)": 0.041866 }, { "clip_ratio": 0.0, "completion_length": 8.1, "epoch": 0.301010101010101, "grad_norm": 5.112743377685547, "kl": 0.06104888916015625, "learning_rate": 2e-07, "loss": 0.006612183898687363, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.13333333730697633, "reward_std": 0.24261614382267, "rewards/MultiModalAccuracyORM": 0.13333333730697633, "step": 745, "train_speed(iter/s)": 0.041874 }, { "epoch": 0.30303030303030304, "grad_norm": 3.3870651721954346, "learning_rate": 2e-07, "loss": 0.007025846093893051, "memory(GiB)": 104.49, "step": 750, "train_speed(iter/s)": 0.041879 }, { "epoch": 0.30303030303030304, "eval_clip_ratio": 0.0, "eval_completion_length": 26.371667375564574, "eval_kl": 0.08423469543457031, "eval_loss": 0.020288411527872086, "eval_response_clip_ratio": 0.0, "eval_reward": 0.3050000049173832, "eval_reward_std": 0.28924588978290555, "eval_rewards/MultiModalAccuracyORM": 0.3050000049173832, "eval_runtime": 257.2173, "eval_samples_per_second": 0.194, "eval_steps_per_second": 0.019, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 25.575, "epoch": 0.30505050505050507, "grad_norm": 3.0410096645355225, "kl": 0.09359779357910156, "learning_rate": 2e-07, "loss": 0.01778276413679123, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20416667126119137, "reward_std": 0.21572377979755403, "rewards/MultiModalAccuracyORM": 0.20416667126119137, "step": 755, "train_speed(iter/s)": 0.041122 }, { "clip_ratio": 0.0, "completion_length": 10.25, "epoch": 0.30707070707070705, "grad_norm": 13.25398063659668, "kl": 0.0601959228515625, "learning_rate": 2e-07, "loss": -0.023943953216075897, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.24010565578937532, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 760, "train_speed(iter/s)": 0.041142 }, { "clip_ratio": 0.0, "completion_length": 5.05, "epoch": 0.3090909090909091, "grad_norm": 0.06504862755537033, "kl": 0.0304901123046875, "learning_rate": 2e-07, "loss": -0.007498346269130707, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333387970924, "reward_std": 0.3021644026041031, "rewards/MultiModalAccuracyORM": 0.3833333387970924, "step": 765, "train_speed(iter/s)": 0.041185 }, { "clip_ratio": 0.0, "completion_length": 50.95, "epoch": 0.3111111111111111, "grad_norm": 18.189159393310547, "kl": 0.0461090087890625, "learning_rate": 2e-07, "loss": -0.0027750393375754355, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.29709570705890653, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 770, "train_speed(iter/s)": 0.041169 }, { "clip_ratio": 0.0, "completion_length": 19.85, "epoch": 0.31313131313131315, "grad_norm": 0.3038291931152344, "kl": 0.03930206298828125, "learning_rate": 2e-07, "loss": -0.0053185861557722095, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1666666731238365, "reward_std": 0.2386302560567856, "rewards/MultiModalAccuracyORM": 0.1666666731238365, "step": 775, "train_speed(iter/s)": 0.041176 }, { "clip_ratio": 0.0, "completion_length": 21.35, "epoch": 0.3151515151515151, "grad_norm": 10.563432693481445, "kl": 0.02420806884765625, "learning_rate": 2e-07, "loss": -0.005909685418009758, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10833333432674408, "reward_std": 0.2135300010442734, "rewards/MultiModalAccuracyORM": 0.10833333432674408, "step": 780, "train_speed(iter/s)": 0.041208 }, { "clip_ratio": 0.0, "completion_length": 17.1, "epoch": 0.31717171717171716, "grad_norm": 5.078320503234863, "kl": 0.026453018188476562, "learning_rate": 2e-07, "loss": 0.0009352466091513634, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666738688946, "reward_std": 0.256683474779129, "rewards/MultiModalAccuracyORM": 0.2666666738688946, "step": 785, "train_speed(iter/s)": 0.041229 }, { "clip_ratio": 0.0, "completion_length": 12.35, "epoch": 0.3191919191919192, "grad_norm": 10.143798828125, "kl": 0.03321533203125, "learning_rate": 2e-07, "loss": 0.012424397468566894, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000037252903, "reward_std": 0.26597192585468293, "rewards/MultiModalAccuracyORM": 0.2750000037252903, "step": 790, "train_speed(iter/s)": 0.041249 }, { "clip_ratio": 0.0, "completion_length": 163.15, "epoch": 0.3212121212121212, "grad_norm": 0.5449197888374329, "kl": 0.019189453125, "learning_rate": 2e-07, "loss": 0.030487871170043944, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333447575569, "reward_std": 0.39207376539707184, "rewards/MultiModalAccuracyORM": 0.2583333447575569, "step": 795, "train_speed(iter/s)": 0.041183 }, { "clip_ratio": 0.0, "completion_length": 22.25, "epoch": 0.32323232323232326, "grad_norm": 1.004371166229248, "kl": 0.037581253051757815, "learning_rate": 2e-07, "loss": 0.0017656445503234862, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666693985462, "reward_std": 0.25270916521549225, "rewards/MultiModalAccuracyORM": 0.2666666693985462, "step": 800, "train_speed(iter/s)": 0.041199 }, { "clip_ratio": 0.0, "completion_length": 13.95, "epoch": 0.32525252525252524, "grad_norm": 19.628896713256836, "kl": 0.053558349609375, "learning_rate": 2e-07, "loss": -0.021615955233573913, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1666666693985462, "reward_std": 0.32094223201274874, "rewards/MultiModalAccuracyORM": 0.1666666693985462, "step": 805, "train_speed(iter/s)": 0.041213 }, { "clip_ratio": 0.0, "completion_length": 12.7, "epoch": 0.32727272727272727, "grad_norm": 6.42383337020874, "kl": 0.18563766479492189, "learning_rate": 2e-07, "loss": 0.033368897438049314, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.2744703501462936, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 810, "train_speed(iter/s)": 0.041234 }, { "clip_ratio": 0.0, "completion_length": 12.8, "epoch": 0.3292929292929293, "grad_norm": 3.2321925163269043, "kl": 0.08846683502197265, "learning_rate": 2e-07, "loss": 0.003480428457260132, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000298023225, "reward_std": 0.2953897833824158, "rewards/MultiModalAccuracyORM": 0.15000000298023225, "step": 815, "train_speed(iter/s)": 0.041242 }, { "clip_ratio": 0.0, "completion_length": 18.85, "epoch": 0.33131313131313134, "grad_norm": 5.854945659637451, "kl": 0.011492156982421875, "learning_rate": 2e-07, "loss": -0.008568185567855834, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.3172461599111557, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 820, "train_speed(iter/s)": 0.041263 }, { "clip_ratio": 0.0, "completion_length": 7.25, "epoch": 0.3333333333333333, "grad_norm": 17.020723342895508, "kl": 0.029691314697265624, "learning_rate": 2e-07, "loss": -0.010567378997802735, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.34933354556560514, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 825, "train_speed(iter/s)": 0.041286 }, { "clip_ratio": 0.0, "completion_length": 9.2, "epoch": 0.33535353535353535, "grad_norm": 12.575139999389648, "kl": 0.0603668212890625, "learning_rate": 2e-07, "loss": -0.0004529397003352642, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333432674408, "reward_std": 0.30291883945465087, "rewards/MultiModalAccuracyORM": 0.3833333432674408, "step": 830, "train_speed(iter/s)": 0.041299 }, { "clip_ratio": 0.0, "completion_length": 7.7, "epoch": 0.3373737373737374, "grad_norm": 2.0305564403533936, "kl": 0.08530197143554688, "learning_rate": 2e-07, "loss": -0.0174559086561203, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.36567819118499756, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 835, "train_speed(iter/s)": 0.041331 }, { "clip_ratio": 0.0, "completion_length": 59.2, "epoch": 0.3393939393939394, "grad_norm": 6.523157119750977, "kl": 0.098590087890625, "learning_rate": 2e-07, "loss": -0.014323845505714417, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.31820976436138154, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 840, "train_speed(iter/s)": 0.04132 }, { "clip_ratio": 0.0, "completion_length": 123.2, "epoch": 0.3414141414141414, "grad_norm": 4.560072422027588, "kl": 0.010162353515625, "learning_rate": 2e-07, "loss": 0.02465280294418335, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000447034835, "reward_std": 0.25512446761131286, "rewards/MultiModalAccuracyORM": 0.22500000447034835, "step": 845, "train_speed(iter/s)": 0.041291 }, { "clip_ratio": 0.0, "completion_length": 52.65, "epoch": 0.3434343434343434, "grad_norm": 0.2115914523601532, "kl": 0.1222564697265625, "learning_rate": 2e-07, "loss": 0.01849503219127655, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1416666679084301, "reward_std": 0.18255070447921753, "rewards/MultiModalAccuracyORM": 0.1416666679084301, "step": 850, "train_speed(iter/s)": 0.041263 }, { "clip_ratio": 0.0, "completion_length": 17.6, "epoch": 0.34545454545454546, "grad_norm": 8.007162094116211, "kl": 0.06471099853515624, "learning_rate": 2e-07, "loss": -0.027201026678085327, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667386889455, "reward_std": 0.27853985130786896, "rewards/MultiModalAccuracyORM": 0.41666667386889455, "step": 855, "train_speed(iter/s)": 0.041287 }, { "clip_ratio": 0.0, "completion_length": 14.55, "epoch": 0.3474747474747475, "grad_norm": 14.470208168029785, "kl": 0.07525177001953125, "learning_rate": 2e-07, "loss": -0.01188465803861618, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333334028720857, "reward_std": 0.3921093553304672, "rewards/MultiModalAccuracyORM": 0.33333334028720857, "step": 860, "train_speed(iter/s)": 0.041297 }, { "clip_ratio": 0.0, "completion_length": 16.45, "epoch": 0.34949494949494947, "grad_norm": 11.233606338500977, "kl": 0.10040740966796875, "learning_rate": 2e-07, "loss": 0.02309779226779938, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000037252903, "reward_std": 0.23634997606277466, "rewards/MultiModalAccuracyORM": 0.3000000037252903, "step": 865, "train_speed(iter/s)": 0.041313 }, { "clip_ratio": 0.0, "completion_length": 91.9, "epoch": 0.3515151515151515, "grad_norm": 22.588499069213867, "kl": 0.11612701416015625, "learning_rate": 2e-07, "loss": 0.020076577365398408, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000298023225, "reward_std": 0.26677650213241577, "rewards/MultiModalAccuracyORM": 0.15000000298023225, "step": 870, "train_speed(iter/s)": 0.041279 }, { "clip_ratio": 0.0, "completion_length": 15.75, "epoch": 0.35353535353535354, "grad_norm": 8.226666450500488, "kl": 0.05343475341796875, "learning_rate": 2e-07, "loss": -0.014575448632240296, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3333333395421505, "reward_std": 0.37199449837207793, "rewards/MultiModalAccuracyORM": 0.3333333395421505, "step": 875, "train_speed(iter/s)": 0.041283 }, { "clip_ratio": 0.0, "completion_length": 81.1, "epoch": 0.35555555555555557, "grad_norm": 0.5393237471580505, "kl": 0.0892120361328125, "learning_rate": 2e-07, "loss": 0.006313225626945496, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2833333365619183, "reward_std": 0.20661829113960267, "rewards/MultiModalAccuracyORM": 0.2833333365619183, "step": 880, "train_speed(iter/s)": 0.041271 }, { "clip_ratio": 0.0, "completion_length": 26.1, "epoch": 0.3575757575757576, "grad_norm": 0.05410289764404297, "kl": 0.015875244140625, "learning_rate": 2e-07, "loss": 0.0006564079783856868, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333333432674407, "reward_std": 0.14888326525688172, "rewards/MultiModalAccuracyORM": 0.28333333432674407, "step": 885, "train_speed(iter/s)": 0.04124 }, { "clip_ratio": 0.0, "completion_length": 51.8, "epoch": 0.3595959595959596, "grad_norm": 16.54722785949707, "kl": 0.0899993896484375, "learning_rate": 2e-07, "loss": 0.010663460195064544, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333410322666, "reward_std": 0.2636228919029236, "rewards/MultiModalAccuracyORM": 0.3083333410322666, "step": 890, "train_speed(iter/s)": 0.041246 }, { "clip_ratio": 0.0, "completion_length": 19.9, "epoch": 0.3616161616161616, "grad_norm": 10.844444274902344, "kl": 0.194525146484375, "learning_rate": 2e-07, "loss": -0.04198589324951172, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4833333432674408, "reward_std": 0.36594696044921876, "rewards/MultiModalAccuracyORM": 0.4833333432674408, "step": 895, "train_speed(iter/s)": 0.04126 }, { "clip_ratio": 0.0, "completion_length": 30.25, "epoch": 0.36363636363636365, "grad_norm": 5.428062915802002, "kl": 0.0639495849609375, "learning_rate": 2e-07, "loss": -0.029673090577125548, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667237877846, "reward_std": 0.2916341096162796, "rewards/MultiModalAccuracyORM": 0.21666667237877846, "step": 900, "train_speed(iter/s)": 0.041259 }, { "clip_ratio": 0.0, "completion_length": 6.9, "epoch": 0.3656565656565657, "grad_norm": 0.12221446633338928, "kl": 0.07857627868652343, "learning_rate": 2e-07, "loss": -0.016133570671081544, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000447034837, "reward_std": 0.2135299950838089, "rewards/MultiModalAccuracyORM": 0.37500000447034837, "step": 905, "train_speed(iter/s)": 0.041282 }, { "clip_ratio": 0.0, "completion_length": 4.65, "epoch": 0.36767676767676766, "grad_norm": 28.893342971801758, "kl": 0.09071540832519531, "learning_rate": 2e-07, "loss": 0.006183768063783646, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3583333358168602, "reward_std": 0.23309451341629028, "rewards/MultiModalAccuracyORM": 0.3583333358168602, "step": 910, "train_speed(iter/s)": 0.041302 }, { "clip_ratio": 0.0, "completion_length": 8.4, "epoch": 0.3696969696969697, "grad_norm": 0.04850845783948898, "kl": 0.03702239990234375, "learning_rate": 2e-07, "loss": 0.031198829412460327, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.2325587123632431, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 915, "train_speed(iter/s)": 0.041316 }, { "clip_ratio": 0.0, "completion_length": 88.5, "epoch": 0.3717171717171717, "grad_norm": 6.006438732147217, "kl": 0.09772415161132812, "learning_rate": 2e-07, "loss": 0.02726798951625824, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333417773247, "reward_std": 0.3043610692024231, "rewards/MultiModalAccuracyORM": 0.3083333417773247, "step": 920, "train_speed(iter/s)": 0.041277 }, { "clip_ratio": 0.0, "completion_length": 35.3, "epoch": 0.37373737373737376, "grad_norm": 1.342499852180481, "kl": 0.011273193359375, "learning_rate": 2e-07, "loss": 0.00134199857711792, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.15000000074505807, "reward_std": 0.13182924091815948, "rewards/MultiModalAccuracyORM": 0.15000000074505807, "step": 925, "train_speed(iter/s)": 0.041269 }, { "clip_ratio": 0.0, "completion_length": 12.45, "epoch": 0.37575757575757573, "grad_norm": 3.2022011280059814, "kl": 0.165765380859375, "learning_rate": 2e-07, "loss": -0.004855489730834961, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333879709245, "reward_std": 0.33156771659851075, "rewards/MultiModalAccuracyORM": 0.35833333879709245, "step": 930, "train_speed(iter/s)": 0.041287 }, { "clip_ratio": 0.0, "completion_length": 50.05, "epoch": 0.37777777777777777, "grad_norm": 0.07847103476524353, "kl": 0.05077667236328125, "learning_rate": 2e-07, "loss": -0.023166632652282713, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4166666716337204, "reward_std": 0.32526595890522003, "rewards/MultiModalAccuracyORM": 0.4166666716337204, "step": 935, "train_speed(iter/s)": 0.041275 }, { "clip_ratio": 0.0, "completion_length": 24.15, "epoch": 0.3797979797979798, "grad_norm": 18.610437393188477, "kl": 0.03169517517089844, "learning_rate": 2e-07, "loss": 0.024595724046230318, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.18333334028720855, "reward_std": 0.3267322063446045, "rewards/MultiModalAccuracyORM": 0.18333334028720855, "step": 940, "train_speed(iter/s)": 0.041277 }, { "clip_ratio": 0.0, "completion_length": 13.45, "epoch": 0.38181818181818183, "grad_norm": 5.941343784332275, "kl": 0.0661346435546875, "learning_rate": 2e-07, "loss": 0.024455997347831725, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333387970924, "reward_std": 0.21973656117916107, "rewards/MultiModalAccuracyORM": 0.2583333387970924, "step": 945, "train_speed(iter/s)": 0.04129 }, { "clip_ratio": 0.0, "completion_length": 37.05, "epoch": 0.3838383838383838, "grad_norm": 24.896520614624023, "kl": 0.150213623046875, "learning_rate": 2e-07, "loss": 0.017214223742485046, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667312383654, "reward_std": 0.33557761609554293, "rewards/MultiModalAccuracyORM": 0.41666667312383654, "step": 950, "train_speed(iter/s)": 0.041288 }, { "clip_ratio": 0.0, "completion_length": 15.5, "epoch": 0.38585858585858585, "grad_norm": 8.904081344604492, "kl": 0.12316970825195313, "learning_rate": 2e-07, "loss": -0.0002661585807800293, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000022351742, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.3000000022351742, "step": 955, "train_speed(iter/s)": 0.041304 }, { "clip_ratio": 0.0, "completion_length": 31.05, "epoch": 0.3878787878787879, "grad_norm": 0.30000391602516174, "kl": 0.193408203125, "learning_rate": 2e-07, "loss": -0.016391244530677796, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5916666716337204, "reward_std": 0.15219832956790924, "rewards/MultiModalAccuracyORM": 0.5916666716337204, "step": 960, "train_speed(iter/s)": 0.041312 }, { "clip_ratio": 0.0, "completion_length": 9.6, "epoch": 0.3898989898989899, "grad_norm": 1.9883811473846436, "kl": 0.046465301513671876, "learning_rate": 2e-07, "loss": -0.0011612892150878907, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667014360427, "reward_std": 0.23930107951164245, "rewards/MultiModalAccuracyORM": 0.24166667014360427, "step": 965, "train_speed(iter/s)": 0.041313 }, { "clip_ratio": 0.0, "completion_length": 25.4, "epoch": 0.39191919191919194, "grad_norm": 17.314956665039062, "kl": 0.10909576416015625, "learning_rate": 2e-07, "loss": 0.003603992611169815, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.26666666865348815, "reward_std": 0.26976318955421447, "rewards/MultiModalAccuracyORM": 0.26666666865348815, "step": 970, "train_speed(iter/s)": 0.04131 }, { "clip_ratio": 0.0, "completion_length": 26.45, "epoch": 0.3939393939393939, "grad_norm": 2.700242042541504, "kl": 0.0446197509765625, "learning_rate": 2e-07, "loss": -0.024584516882896423, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.09166666865348816, "reward_std": 0.23854664266109465, "rewards/MultiModalAccuracyORM": 0.09166666865348816, "step": 975, "train_speed(iter/s)": 0.041305 }, { "clip_ratio": 0.0, "completion_length": 19.85, "epoch": 0.39595959595959596, "grad_norm": 1.759245753288269, "kl": 0.0932861328125, "learning_rate": 2e-07, "loss": 0.03299914002418518, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333334028720856, "reward_std": 0.22785155177116395, "rewards/MultiModalAccuracyORM": 0.23333334028720856, "step": 980, "train_speed(iter/s)": 0.041315 }, { "clip_ratio": 0.0, "completion_length": 32.7, "epoch": 0.397979797979798, "grad_norm": 12.485607147216797, "kl": 0.061135292053222656, "learning_rate": 2e-07, "loss": 0.022333118319511413, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.26666667088866236, "reward_std": 0.27903059422969817, "rewards/MultiModalAccuracyORM": 0.26666667088866236, "step": 985, "train_speed(iter/s)": 0.041318 }, { "clip_ratio": 0.0, "completion_length": 48.65, "epoch": 0.4, "grad_norm": 4.170945644378662, "kl": 0.0902923583984375, "learning_rate": 2e-07, "loss": -0.00014310678234323858, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.2511145681142807, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 990, "train_speed(iter/s)": 0.041309 }, { "clip_ratio": 0.0, "completion_length": 8.25, "epoch": 0.402020202020202, "grad_norm": 2.5125696659088135, "kl": 0.12824859619140624, "learning_rate": 2e-07, "loss": 0.0361581027507782, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5250000052154065, "reward_std": 0.2526735752820969, "rewards/MultiModalAccuracyORM": 0.5250000052154065, "step": 995, "train_speed(iter/s)": 0.041331 }, { "epoch": 0.40404040404040403, "grad_norm": 24.84500503540039, "learning_rate": 2e-07, "loss": -0.03532302379608154, "memory(GiB)": 104.49, "step": 1000, "train_speed(iter/s)": 0.041234 }, { "epoch": 0.40404040404040403, "eval_clip_ratio": 0.0, "eval_completion_length": 40.71333456993103, "eval_kl": 0.09849456787109374, "eval_loss": 0.019675862044095993, "eval_response_clip_ratio": 0.0, "eval_reward": 0.36166667461395263, "eval_reward_std": 0.2775319296121597, "eval_rewards/MultiModalAccuracyORM": 0.36166667461395263, "eval_runtime": 294.4392, "eval_samples_per_second": 0.17, "eval_steps_per_second": 0.017, "step": 1000 }, { "clip_ratio": 0.0, "completion_length": 63.525, "epoch": 0.40606060606060607, "grad_norm": 2.5043818950653076, "kl": 0.041501617431640624, "learning_rate": 2e-07, "loss": -0.008308599889278411, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.21963488459587097, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 1005, "train_speed(iter/s)": 0.040624 }, { "clip_ratio": 0.0, "completion_length": 6.05, "epoch": 0.4080808080808081, "grad_norm": 19.067171096801758, "kl": 0.07535552978515625, "learning_rate": 2e-07, "loss": 0.017892301082611084, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333334028720857, "reward_std": 0.22005038857460021, "rewards/MultiModalAccuracyORM": 0.33333334028720857, "step": 1010, "train_speed(iter/s)": 0.040645 }, { "clip_ratio": 0.0, "completion_length": 70.6, "epoch": 0.4101010101010101, "grad_norm": 2.5989065170288086, "kl": 0.0229217529296875, "learning_rate": 2e-07, "loss": 0.040188026428222653, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.25591449439525604, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 1015, "train_speed(iter/s)": 0.040633 }, { "clip_ratio": 0.0, "completion_length": 58.65, "epoch": 0.4121212121212121, "grad_norm": 11.748002052307129, "kl": 0.06688776016235351, "learning_rate": 2e-07, "loss": -0.0008021335117518902, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.11666666939854622, "reward_std": 0.17150862216949464, "rewards/MultiModalAccuracyORM": 0.11666666939854622, "step": 1020, "train_speed(iter/s)": 0.040631 }, { "clip_ratio": 0.0, "completion_length": 25.15, "epoch": 0.41414141414141414, "grad_norm": 0.12045960873365402, "kl": 0.03296966552734375, "learning_rate": 2e-07, "loss": -0.010370378196239472, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.11666667088866234, "reward_std": 0.17081378698348998, "rewards/MultiModalAccuracyORM": 0.11666667088866234, "step": 1025, "train_speed(iter/s)": 0.040649 }, { "clip_ratio": 0.0, "completion_length": 47.85, "epoch": 0.4161616161616162, "grad_norm": 1.6403871774673462, "kl": 0.06666259765625, "learning_rate": 2e-07, "loss": 0.00585133358836174, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000074505806, "reward_std": 0.17705594301223754, "rewards/MultiModalAccuracyORM": 0.17500000074505806, "step": 1030, "train_speed(iter/s)": 0.040624 }, { "clip_ratio": 0.0, "completion_length": 34.3, "epoch": 0.41818181818181815, "grad_norm": 0.014441369101405144, "kl": 0.07417640686035157, "learning_rate": 2e-07, "loss": -0.010604190826416015, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.2003761351108551, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 1035, "train_speed(iter/s)": 0.040636 }, { "clip_ratio": 0.0, "completion_length": 47.6, "epoch": 0.4202020202020202, "grad_norm": 6.607668399810791, "kl": 0.1425227165222168, "learning_rate": 2e-07, "loss": 0.02794753313064575, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3583333395421505, "reward_std": 0.2567190647125244, "rewards/MultiModalAccuracyORM": 0.3583333395421505, "step": 1040, "train_speed(iter/s)": 0.040638 }, { "clip_ratio": 0.0, "completion_length": 6.6, "epoch": 0.4222222222222222, "grad_norm": 0.8122760057449341, "kl": 0.1528533935546875, "learning_rate": 2e-07, "loss": 0.019382116198539735, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000670552254, "reward_std": 0.2034369796514511, "rewards/MultiModalAccuracyORM": 0.17500000670552254, "step": 1045, "train_speed(iter/s)": 0.040661 }, { "clip_ratio": 0.0, "completion_length": 29.85, "epoch": 0.42424242424242425, "grad_norm": 0.18659576773643494, "kl": 0.010857391357421874, "learning_rate": 2e-07, "loss": 0.015965181589126586, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.1660114347934723, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 1050, "train_speed(iter/s)": 0.040661 }, { "clip_ratio": 0.0, "completion_length": 21.25, "epoch": 0.4262626262626263, "grad_norm": 0.4390380382537842, "kl": 0.07591552734375, "learning_rate": 2e-07, "loss": 0.011004485189914703, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4250000111758709, "reward_std": 0.24862808585166932, "rewards/MultiModalAccuracyORM": 0.4250000111758709, "step": 1055, "train_speed(iter/s)": 0.040671 }, { "clip_ratio": 0.0, "completion_length": 52.95, "epoch": 0.42828282828282827, "grad_norm": 0.3618135452270508, "kl": 0.109490966796875, "learning_rate": 2e-07, "loss": 0.011407237499952316, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3333333387970924, "reward_std": 0.14589657187461852, "rewards/MultiModalAccuracyORM": 0.3333333387970924, "step": 1060, "train_speed(iter/s)": 0.04069 }, { "clip_ratio": 0.0, "completion_length": 47.1, "epoch": 0.4303030303030303, "grad_norm": 13.074536323547363, "kl": 0.18959503173828124, "learning_rate": 2e-07, "loss": 0.04986717700958252, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000029802322, "reward_std": 0.14589657187461852, "rewards/MultiModalAccuracyORM": 0.3000000029802322, "step": 1065, "train_speed(iter/s)": 0.040694 }, { "clip_ratio": 0.0, "completion_length": 28.9, "epoch": 0.43232323232323233, "grad_norm": 6.16197395324707, "kl": 0.15793075561523437, "learning_rate": 2e-07, "loss": 0.06019207835197449, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333334177732467, "reward_std": 0.2669951319694519, "rewards/MultiModalAccuracyORM": 0.28333334177732467, "step": 1070, "train_speed(iter/s)": 0.040701 }, { "clip_ratio": 0.0, "completion_length": 27.05, "epoch": 0.43434343434343436, "grad_norm": 25.265649795532227, "kl": 0.08460769653320313, "learning_rate": 2e-07, "loss": -0.04109536409378052, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3750000037252903, "reward_std": 0.2325587123632431, "rewards/MultiModalAccuracyORM": 0.3750000037252903, "step": 1075, "train_speed(iter/s)": 0.040703 }, { "clip_ratio": 0.0, "completion_length": 33.0, "epoch": 0.43636363636363634, "grad_norm": 2.5213825702667236, "kl": 0.089501953125, "learning_rate": 2e-07, "loss": 0.011518492549657821, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3333333387970924, "reward_std": 0.35792474150657655, "rewards/MultiModalAccuracyORM": 0.3333333387970924, "step": 1080, "train_speed(iter/s)": 0.040705 }, { "clip_ratio": 0.0, "completion_length": 65.85, "epoch": 0.4383838383838384, "grad_norm": 2.2053442001342773, "kl": 0.014685440063476562, "learning_rate": 2e-07, "loss": -0.03693766593933105, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666701436043, "reward_std": 0.2370448112487793, "rewards/MultiModalAccuracyORM": 0.2666666701436043, "step": 1085, "train_speed(iter/s)": 0.040693 }, { "clip_ratio": 0.0, "completion_length": 10.2, "epoch": 0.4404040404040404, "grad_norm": 12.156472206115723, "kl": 0.17877197265625, "learning_rate": 2e-07, "loss": 0.032665693759918214, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666746139526, "reward_std": 0.27148365676403047, "rewards/MultiModalAccuracyORM": 0.3416666746139526, "step": 1090, "train_speed(iter/s)": 0.040713 }, { "clip_ratio": 0.0, "completion_length": 8.85, "epoch": 0.44242424242424244, "grad_norm": 1.4023343324661255, "kl": 0.098193359375, "learning_rate": 2e-07, "loss": -0.007838453352451324, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000022351742, "reward_std": 0.172567418217659, "rewards/MultiModalAccuracyORM": 0.4000000022351742, "step": 1095, "train_speed(iter/s)": 0.040737 }, { "clip_ratio": 0.0, "completion_length": 67.6, "epoch": 0.4444444444444444, "grad_norm": 10.351971626281738, "kl": 0.02147979736328125, "learning_rate": 2e-07, "loss": 0.03331095576286316, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666775941849, "reward_std": 0.3504018098115921, "rewards/MultiModalAccuracyORM": 0.3166666775941849, "step": 1100, "train_speed(iter/s)": 0.04073 }, { "clip_ratio": 0.0, "completion_length": 12.95, "epoch": 0.44646464646464645, "grad_norm": 13.833907127380371, "kl": 0.019232177734375, "learning_rate": 2e-07, "loss": -0.005460131168365479, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000089406967, "reward_std": 0.2667409062385559, "rewards/MultiModalAccuracyORM": 0.3250000089406967, "step": 1105, "train_speed(iter/s)": 0.040741 }, { "clip_ratio": 0.0, "completion_length": 26.65, "epoch": 0.4484848484848485, "grad_norm": 2.0316038131713867, "kl": 0.018201828002929688, "learning_rate": 2e-07, "loss": -0.0024514278396964074, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333879709245, "reward_std": 0.2793444275856018, "rewards/MultiModalAccuracyORM": 0.35833333879709245, "step": 1110, "train_speed(iter/s)": 0.04076 }, { "clip_ratio": 0.0, "completion_length": 7.0, "epoch": 0.4505050505050505, "grad_norm": 15.886459350585938, "kl": 0.21325912475585937, "learning_rate": 2e-07, "loss": 0.0038191914558410645, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166666865348815, "reward_std": 0.2245364874601364, "rewards/MultiModalAccuracyORM": 0.24166666865348815, "step": 1115, "train_speed(iter/s)": 0.040791 }, { "clip_ratio": 0.0, "completion_length": 12.4, "epoch": 0.45252525252525255, "grad_norm": 0.03295298293232918, "kl": 0.1110443115234375, "learning_rate": 2e-07, "loss": 0.013870391249656677, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.27402731478214265, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 1120, "train_speed(iter/s)": 0.040796 }, { "clip_ratio": 0.0, "completion_length": 27.9, "epoch": 0.45454545454545453, "grad_norm": 2.8173696994781494, "kl": 0.0269622802734375, "learning_rate": 2e-07, "loss": 0.03692147135734558, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333387970924, "reward_std": 0.2159452974796295, "rewards/MultiModalAccuracyORM": 0.3833333387970924, "step": 1125, "train_speed(iter/s)": 0.04082 }, { "clip_ratio": 0.0, "completion_length": 16.45, "epoch": 0.45656565656565656, "grad_norm": 0.10465247184038162, "kl": 0.04431991577148438, "learning_rate": 2e-07, "loss": 0.003530232235789299, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666701436043, "reward_std": 0.2323044866323471, "rewards/MultiModalAccuracyORM": 0.3166666701436043, "step": 1130, "train_speed(iter/s)": 0.040817 }, { "clip_ratio": 0.0, "completion_length": 7.6, "epoch": 0.4585858585858586, "grad_norm": 0.32010194659233093, "kl": 0.094537353515625, "learning_rate": 2e-07, "loss": 0.012909208238124848, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.0916666716337204, "reward_std": 0.1293427586555481, "rewards/MultiModalAccuracyORM": 0.0916666716337204, "step": 1135, "train_speed(iter/s)": 0.040832 }, { "clip_ratio": 0.0, "completion_length": 67.8, "epoch": 0.46060606060606063, "grad_norm": 15.148902893066406, "kl": 0.07255020141601562, "learning_rate": 2e-07, "loss": 0.016760605573654174, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000000149011612, "reward_std": 0.2260383188724518, "rewards/MultiModalAccuracyORM": 0.20000000149011612, "step": 1140, "train_speed(iter/s)": 0.040831 }, { "clip_ratio": 0.0, "completion_length": 54.75, "epoch": 0.4626262626262626, "grad_norm": 4.259115219116211, "kl": 0.012025833129882812, "learning_rate": 2e-07, "loss": -0.004991362616419792, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000298023223, "reward_std": 0.2003761351108551, "rewards/MultiModalAccuracyORM": 0.12500000298023223, "step": 1145, "train_speed(iter/s)": 0.040832 }, { "clip_ratio": 0.0, "completion_length": 8.7, "epoch": 0.46464646464646464, "grad_norm": 4.517999649047852, "kl": 0.0364471435546875, "learning_rate": 2e-07, "loss": 0.0014625540003180503, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000149011614, "reward_std": 0.18561154305934907, "rewards/MultiModalAccuracyORM": 0.37500000149011614, "step": 1150, "train_speed(iter/s)": 0.040853 }, { "clip_ratio": 0.0, "completion_length": 10.1, "epoch": 0.4666666666666667, "grad_norm": 9.037857055664062, "kl": 0.066754150390625, "learning_rate": 2e-07, "loss": 0.023162148892879486, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000029802322, "reward_std": 0.14589657187461852, "rewards/MultiModalAccuracyORM": 0.3000000029802322, "step": 1155, "train_speed(iter/s)": 0.040895 }, { "clip_ratio": 0.0, "completion_length": 5.7, "epoch": 0.4686868686868687, "grad_norm": 0.35684671998023987, "kl": 0.1403411865234375, "learning_rate": 2e-07, "loss": 0.011607617139816284, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.47500001043081286, "reward_std": 0.19340355396270753, "rewards/MultiModalAccuracyORM": 0.47500001043081286, "step": 1160, "train_speed(iter/s)": 0.040919 }, { "clip_ratio": 0.0, "completion_length": 12.4, "epoch": 0.4707070707070707, "grad_norm": 0.18109376728534698, "kl": 0.0370758056640625, "learning_rate": 2e-07, "loss": -0.0030417680740356446, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.1848811239004135, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 1165, "train_speed(iter/s)": 0.040938 }, { "clip_ratio": 0.0, "completion_length": 8.8, "epoch": 0.4727272727272727, "grad_norm": 17.05179786682129, "kl": 0.027799224853515624, "learning_rate": 2e-07, "loss": -0.01608174741268158, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333656191824, "reward_std": 0.25566026866436004, "rewards/MultiModalAccuracyORM": 0.35833333656191824, "step": 1170, "train_speed(iter/s)": 0.040961 }, { "clip_ratio": 0.0, "completion_length": 35.55, "epoch": 0.47474747474747475, "grad_norm": 2.053295850753784, "kl": 0.0653228759765625, "learning_rate": 2e-07, "loss": 0.0025410931557416916, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000149011613, "reward_std": 0.18780820965766906, "rewards/MultiModalAccuracyORM": 0.30000000149011613, "step": 1175, "train_speed(iter/s)": 0.040966 }, { "clip_ratio": 0.0, "completion_length": 3.65, "epoch": 0.4767676767676768, "grad_norm": 12.327520370483398, "kl": 0.1503997802734375, "learning_rate": 2e-07, "loss": 0.00606456995010376, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.47500000298023226, "reward_std": 0.16696292161941528, "rewards/MultiModalAccuracyORM": 0.47500000298023226, "step": 1180, "train_speed(iter/s)": 0.040998 }, { "clip_ratio": 0.0, "completion_length": 9.1, "epoch": 0.47878787878787876, "grad_norm": 0.1990954726934433, "kl": 0.26718597412109374, "learning_rate": 2e-07, "loss": 0.011653450131416321, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.38333333656191826, "reward_std": 0.27402731478214265, "rewards/MultiModalAccuracyORM": 0.38333333656191826, "step": 1185, "train_speed(iter/s)": 0.041009 }, { "clip_ratio": 0.0, "completion_length": 7.5, "epoch": 0.4808080808080808, "grad_norm": 5.806619644165039, "kl": 0.059732818603515626, "learning_rate": 2e-07, "loss": -0.013705405592918395, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.09166667088866234, "reward_std": 0.12558708488941192, "rewards/MultiModalAccuracyORM": 0.09166667088866234, "step": 1190, "train_speed(iter/s)": 0.041032 }, { "clip_ratio": 0.0, "completion_length": 8.2, "epoch": 0.48282828282828283, "grad_norm": 12.781750679016113, "kl": 0.04134521484375, "learning_rate": 2e-07, "loss": -0.008668276667594909, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4416666693985462, "reward_std": 0.2597057580947876, "rewards/MultiModalAccuracyORM": 0.4416666693985462, "step": 1195, "train_speed(iter/s)": 0.041043 }, { "clip_ratio": 0.0, "completion_length": 6.45, "epoch": 0.48484848484848486, "grad_norm": 3.4121592044830322, "kl": 0.073028564453125, "learning_rate": 2e-07, "loss": -0.0033960781991481783, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2833333447575569, "reward_std": 0.38452682793140414, "rewards/MultiModalAccuracyORM": 0.2833333447575569, "step": 1200, "train_speed(iter/s)": 0.041068 }, { "clip_ratio": 0.0, "completion_length": 19.65, "epoch": 0.4868686868686869, "grad_norm": 2.179175615310669, "kl": 0.1186309814453125, "learning_rate": 2e-07, "loss": 0.0020799320191144943, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5500000081956387, "reward_std": 0.383985635638237, "rewards/MultiModalAccuracyORM": 0.5500000081956387, "step": 1205, "train_speed(iter/s)": 0.041076 }, { "clip_ratio": 0.0, "completion_length": 18.0, "epoch": 0.4888888888888889, "grad_norm": 16.699316024780273, "kl": 0.19964828491210937, "learning_rate": 2e-07, "loss": 0.07210339307785034, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667014360426, "reward_std": 0.27151924669742583, "rewards/MultiModalAccuracyORM": 0.41666667014360426, "step": 1210, "train_speed(iter/s)": 0.041084 }, { "clip_ratio": 0.0, "completion_length": 12.2, "epoch": 0.4909090909090909, "grad_norm": 11.2245512008667, "kl": 0.02044839859008789, "learning_rate": 2e-07, "loss": 0.0006846427917480469, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666738688946, "reward_std": 0.22074522376060485, "rewards/MultiModalAccuracyORM": 0.3166666738688946, "step": 1215, "train_speed(iter/s)": 0.041094 }, { "clip_ratio": 0.0, "completion_length": 10.95, "epoch": 0.49292929292929294, "grad_norm": 23.733837127685547, "kl": 0.0533355712890625, "learning_rate": 2e-07, "loss": -0.03312296569347382, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666666939854623, "reward_std": 0.3827823489904404, "rewards/MultiModalAccuracyORM": 0.21666666939854623, "step": 1220, "train_speed(iter/s)": 0.041103 }, { "clip_ratio": 0.0, "completion_length": 7.4, "epoch": 0.494949494949495, "grad_norm": 5.569579124450684, "kl": 0.12704048156738282, "learning_rate": 2e-07, "loss": -0.030297344923019408, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667386889455, "reward_std": 0.3534030467271805, "rewards/MultiModalAccuracyORM": 0.41666667386889455, "step": 1225, "train_speed(iter/s)": 0.041105 }, { "clip_ratio": 0.0, "completion_length": 6.15, "epoch": 0.49696969696969695, "grad_norm": 13.687773704528809, "kl": 0.054621124267578126, "learning_rate": 2e-07, "loss": 0.020814248919487, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166667312383654, "reward_std": 0.2981545031070709, "rewards/MultiModalAccuracyORM": 0.29166667312383654, "step": 1230, "train_speed(iter/s)": 0.041117 }, { "clip_ratio": 0.0, "completion_length": 15.45, "epoch": 0.498989898989899, "grad_norm": 4.014401912689209, "kl": 0.11805038452148438, "learning_rate": 2e-07, "loss": -0.014261078834533692, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.31666666865348814, "reward_std": 0.24615318179130555, "rewards/MultiModalAccuracyORM": 0.31666666865348814, "step": 1235, "train_speed(iter/s)": 0.041139 }, { "clip_ratio": 0.0, "completion_length": 57.15, "epoch": 0.501010101010101, "grad_norm": 7.063708782196045, "kl": 0.04602813720703125, "learning_rate": 2e-07, "loss": -0.0014480194076895714, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3500000052154064, "reward_std": 0.2486636757850647, "rewards/MultiModalAccuracyORM": 0.3500000052154064, "step": 1240, "train_speed(iter/s)": 0.041135 }, { "clip_ratio": 0.0, "completion_length": 15.65, "epoch": 0.503030303030303, "grad_norm": 0.07285178452730179, "kl": 0.06838836669921874, "learning_rate": 2e-07, "loss": 0.007464568316936493, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.26703072190284727, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 1245, "train_speed(iter/s)": 0.04113 }, { "epoch": 0.5050505050505051, "grad_norm": 16.691085815429688, "learning_rate": 2e-07, "loss": 0.027106884121894836, "memory(GiB)": 104.49, "step": 1250, "train_speed(iter/s)": 0.041132 }, { "epoch": 0.5050505050505051, "eval_clip_ratio": 0.0, "eval_completion_length": 24.193333625793457, "eval_kl": 0.0990032958984375, "eval_loss": 0.013061273843050003, "eval_response_clip_ratio": 0.0, "eval_reward": 0.3783333380520344, "eval_reward_std": 0.21932941377162934, "eval_rewards/MultiModalAccuracyORM": 0.3783333380520344, "eval_runtime": 254.2733, "eval_samples_per_second": 0.197, "eval_steps_per_second": 0.02, "step": 1250 }, { "clip_ratio": 0.0, "completion_length": 13.45, "epoch": 0.5070707070707071, "grad_norm": 1.7288111448287964, "kl": 0.14322261810302733, "learning_rate": 2e-07, "loss": -0.0040175896137952805, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32916667200624944, "reward_std": 0.21599168032407762, "rewards/MultiModalAccuracyORM": 0.32916667200624944, "step": 1255, "train_speed(iter/s)": 0.040698 }, { "clip_ratio": 0.0, "completion_length": 7.8, "epoch": 0.509090909090909, "grad_norm": 30.862096786499023, "kl": 0.065618896484375, "learning_rate": 2e-07, "loss": 0.03462098240852356, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666753590107, "reward_std": 0.3471368789672852, "rewards/MultiModalAccuracyORM": 0.3166666753590107, "step": 1260, "train_speed(iter/s)": 0.040722 }, { "clip_ratio": 0.0, "completion_length": 56.65, "epoch": 0.5111111111111111, "grad_norm": 18.206647872924805, "kl": 0.050946044921875, "learning_rate": 2e-07, "loss": -0.018359455466270446, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500001192092897, "reward_std": 0.285042542219162, "rewards/MultiModalAccuracyORM": 0.37500001192092897, "step": 1265, "train_speed(iter/s)": 0.040714 }, { "clip_ratio": 0.0, "completion_length": 32.8, "epoch": 0.5131313131313131, "grad_norm": 21.11511993408203, "kl": 0.08178558349609374, "learning_rate": 2e-07, "loss": 0.019801269471645355, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5000000059604645, "reward_std": 0.24666163325309753, "rewards/MultiModalAccuracyORM": 0.5000000059604645, "step": 1270, "train_speed(iter/s)": 0.040716 }, { "clip_ratio": 0.0, "completion_length": 33.7, "epoch": 0.5151515151515151, "grad_norm": 2.3435275554656982, "kl": 0.037060546875, "learning_rate": 2e-07, "loss": -0.044399937987327574, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000149011613, "reward_std": 0.18780821561813354, "rewards/MultiModalAccuracyORM": 0.30000000149011613, "step": 1275, "train_speed(iter/s)": 0.040729 }, { "clip_ratio": 0.0, "completion_length": 32.3, "epoch": 0.5171717171717172, "grad_norm": 6.154475688934326, "kl": 0.06382598876953124, "learning_rate": 2e-07, "loss": 0.024791686236858367, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.38333334103226663, "reward_std": 0.3026406019926071, "rewards/MultiModalAccuracyORM": 0.38333334103226663, "step": 1280, "train_speed(iter/s)": 0.040736 }, { "clip_ratio": 0.0, "completion_length": 72.85, "epoch": 0.5191919191919192, "grad_norm": 0.17857688665390015, "kl": 0.05196533203125, "learning_rate": 2e-07, "loss": -0.01656932532787323, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.32905964851379393, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 1285, "train_speed(iter/s)": 0.040739 }, { "clip_ratio": 0.0, "completion_length": 18.1, "epoch": 0.5212121212121212, "grad_norm": 5.7444353103637695, "kl": 0.032296371459960935, "learning_rate": 2e-07, "loss": -0.04405757784843445, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.19166667237877846, "reward_std": 0.3292782843112946, "rewards/MultiModalAccuracyORM": 0.19166667237877846, "step": 1290, "train_speed(iter/s)": 0.04075 }, { "clip_ratio": 0.0, "completion_length": 9.45, "epoch": 0.5232323232323233, "grad_norm": 1.938860297203064, "kl": 0.04727783203125, "learning_rate": 2e-07, "loss": 0.001994212530553341, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4666666701436043, "reward_std": 0.2953156381845474, "rewards/MultiModalAccuracyORM": 0.4666666701436043, "step": 1295, "train_speed(iter/s)": 0.040768 }, { "clip_ratio": 0.0, "completion_length": 8.9, "epoch": 0.5252525252525253, "grad_norm": 23.327890396118164, "kl": 0.118865966796875, "learning_rate": 2e-07, "loss": 0.020175328850746153, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333492279053, "reward_std": 0.3596546709537506, "rewards/MultiModalAccuracyORM": 0.3833333492279053, "step": 1300, "train_speed(iter/s)": 0.04078 }, { "clip_ratio": 0.0, "completion_length": 19.4, "epoch": 0.5272727272727272, "grad_norm": 1.2604830265045166, "kl": 0.082135009765625, "learning_rate": 2e-07, "loss": -0.006745982170104981, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000089406967, "reward_std": 0.23631438612937927, "rewards/MultiModalAccuracyORM": 0.3250000089406967, "step": 1305, "train_speed(iter/s)": 0.040788 }, { "clip_ratio": 0.0, "completion_length": 15.1, "epoch": 0.5292929292929293, "grad_norm": 19.63453483581543, "kl": 0.093505859375, "learning_rate": 2e-07, "loss": -0.01361556351184845, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333334177732465, "reward_std": 0.24337058067321776, "rewards/MultiModalAccuracyORM": 0.33333334177732465, "step": 1310, "train_speed(iter/s)": 0.0408 }, { "clip_ratio": 0.0, "completion_length": 64.75, "epoch": 0.5313131313131313, "grad_norm": 5.953737735748291, "kl": 0.115643310546875, "learning_rate": 2e-07, "loss": 0.004205666109919548, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30833334028720855, "reward_std": 0.21123813688755036, "rewards/MultiModalAccuracyORM": 0.30833334028720855, "step": 1315, "train_speed(iter/s)": 0.040801 }, { "clip_ratio": 0.0, "completion_length": 6.55, "epoch": 0.5333333333333333, "grad_norm": 24.937227249145508, "kl": 0.1268402099609375, "learning_rate": 2e-07, "loss": 0.0015925129875540734, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.26666667237877845, "reward_std": 0.31119862794876096, "rewards/MultiModalAccuracyORM": 0.26666667237877845, "step": 1320, "train_speed(iter/s)": 0.040816 }, { "clip_ratio": 0.0, "completion_length": 11.35, "epoch": 0.5353535353535354, "grad_norm": 0.8153337240219116, "kl": 0.150848388671875, "learning_rate": 2e-07, "loss": -0.021095672249794008, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667088866234, "reward_std": 0.27402731478214265, "rewards/MultiModalAccuracyORM": 0.21666667088866234, "step": 1325, "train_speed(iter/s)": 0.040834 }, { "clip_ratio": 0.0, "completion_length": 10.35, "epoch": 0.5373737373737374, "grad_norm": 18.53838539123535, "kl": 0.046075439453125, "learning_rate": 2e-07, "loss": 0.017172405123710634, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4500000089406967, "reward_std": 0.2159452974796295, "rewards/MultiModalAccuracyORM": 0.4500000089406967, "step": 1330, "train_speed(iter/s)": 0.040851 }, { "clip_ratio": 0.0, "completion_length": 33.05, "epoch": 0.5393939393939394, "grad_norm": 7.678282737731934, "kl": 0.0884857177734375, "learning_rate": 2e-07, "loss": 0.0011547883972525597, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5750000141561031, "reward_std": 0.3044206708669662, "rewards/MultiModalAccuracyORM": 0.5750000141561031, "step": 1335, "train_speed(iter/s)": 0.04087 }, { "clip_ratio": 0.0, "completion_length": 8.7, "epoch": 0.5414141414141415, "grad_norm": 10.90495777130127, "kl": 0.0806304931640625, "learning_rate": 2e-07, "loss": -0.017473408579826356, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000022351742, "reward_std": 0.15518502295017242, "rewards/MultiModalAccuracyORM": 0.2750000022351742, "step": 1340, "train_speed(iter/s)": 0.040877 }, { "clip_ratio": 0.0, "completion_length": 10.55, "epoch": 0.5434343434343434, "grad_norm": 0.10261930525302887, "kl": 0.060321044921875, "learning_rate": 2e-07, "loss": 0.0017479043453931808, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333334028720857, "reward_std": 0.152222341299057, "rewards/MultiModalAccuracyORM": 0.33333334028720857, "step": 1345, "train_speed(iter/s)": 0.040892 }, { "clip_ratio": 0.0, "completion_length": 10.75, "epoch": 0.5454545454545454, "grad_norm": 2.2841360569000244, "kl": 0.024788665771484374, "learning_rate": 2e-07, "loss": -0.02739916443824768, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.2652174890041351, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 1350, "train_speed(iter/s)": 0.040901 }, { "clip_ratio": 0.0, "completion_length": 11.85, "epoch": 0.5474747474747474, "grad_norm": 13.731690406799316, "kl": 0.0828125, "learning_rate": 2e-07, "loss": -0.0664910078048706, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333417773247, "reward_std": 0.3362748771905899, "rewards/MultiModalAccuracyORM": 0.4333333417773247, "step": 1355, "train_speed(iter/s)": 0.04092 }, { "clip_ratio": 0.0, "completion_length": 11.95, "epoch": 0.5494949494949495, "grad_norm": 25.35189437866211, "kl": 0.100750732421875, "learning_rate": 2e-07, "loss": -0.00892333835363388, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333432674408, "reward_std": 0.2915389180183411, "rewards/MultiModalAccuracyORM": 0.3083333432674408, "step": 1360, "train_speed(iter/s)": 0.04093 }, { "clip_ratio": 0.0, "completion_length": 18.1, "epoch": 0.5515151515151515, "grad_norm": 9.685708999633789, "kl": 0.061480712890625, "learning_rate": 2e-07, "loss": 0.012898986041545869, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.21447905600070954, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 1365, "train_speed(iter/s)": 0.040933 }, { "clip_ratio": 0.0, "completion_length": 8.45, "epoch": 0.5535353535353535, "grad_norm": 0.28964653611183167, "kl": 0.1938751220703125, "learning_rate": 2e-07, "loss": 0.01745934933423996, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666701436043, "reward_std": 0.2489179015159607, "rewards/MultiModalAccuracyORM": 0.3416666701436043, "step": 1370, "train_speed(iter/s)": 0.040944 }, { "clip_ratio": 0.0, "completion_length": 17.25, "epoch": 0.5555555555555556, "grad_norm": 8.731843948364258, "kl": 0.06651153564453124, "learning_rate": 2e-07, "loss": 0.03409457206726074, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333656191824, "reward_std": 0.25566026866436004, "rewards/MultiModalAccuracyORM": 0.35833333656191824, "step": 1375, "train_speed(iter/s)": 0.040953 }, { "clip_ratio": 0.0, "completion_length": 10.2, "epoch": 0.5575757575757576, "grad_norm": 35.31602096557617, "kl": 0.100604248046875, "learning_rate": 2e-07, "loss": -0.010587018728256226, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2500000111758709, "reward_std": 0.25487024188041685, "rewards/MultiModalAccuracyORM": 0.2500000111758709, "step": 1380, "train_speed(iter/s)": 0.040972 }, { "clip_ratio": 0.0, "completion_length": 12.55, "epoch": 0.5595959595959596, "grad_norm": 1.9312275648117065, "kl": 0.09021759033203125, "learning_rate": 2e-07, "loss": -0.012255148589611053, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667610406875, "reward_std": 0.255184069275856, "rewards/MultiModalAccuracyORM": 0.24166667610406875, "step": 1385, "train_speed(iter/s)": 0.040973 }, { "clip_ratio": 0.0, "completion_length": 15.25, "epoch": 0.5616161616161616, "grad_norm": 30.091777801513672, "kl": 0.08451480865478515, "learning_rate": 2e-07, "loss": -0.004190707206726074, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2500000074505806, "reward_std": 0.2875886201858521, "rewards/MultiModalAccuracyORM": 0.2500000074505806, "step": 1390, "train_speed(iter/s)": 0.040981 }, { "clip_ratio": 0.0, "completion_length": 5.25, "epoch": 0.5636363636363636, "grad_norm": 5.909719467163086, "kl": 0.16330108642578126, "learning_rate": 2e-07, "loss": -0.01449722945690155, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35000000670552256, "reward_std": 0.15821027159690856, "rewards/MultiModalAccuracyORM": 0.35000000670552256, "step": 1395, "train_speed(iter/s)": 0.040992 }, { "clip_ratio": 0.0, "completion_length": 11.9, "epoch": 0.5656565656565656, "grad_norm": 4.40855598449707, "kl": 0.0266082763671875, "learning_rate": 2e-07, "loss": 0.026001608371734618, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667088866234, "reward_std": 0.3048968702554703, "rewards/MultiModalAccuracyORM": 0.24166667088866234, "step": 1400, "train_speed(iter/s)": 0.041006 }, { "clip_ratio": 0.0, "completion_length": 8.85, "epoch": 0.5676767676767677, "grad_norm": 0.061144277453422546, "kl": 0.07353515625, "learning_rate": 2e-07, "loss": -0.010889561474323272, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667312383652, "reward_std": 0.14815284609794616, "rewards/MultiModalAccuracyORM": 0.24166667312383652, "step": 1405, "train_speed(iter/s)": 0.041018 }, { "clip_ratio": 0.0, "completion_length": 12.6, "epoch": 0.5696969696969697, "grad_norm": 0.037721507251262665, "kl": 0.087078857421875, "learning_rate": 2e-07, "loss": 0.004135938733816147, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45833333358168604, "reward_std": 0.08109080791473389, "rewards/MultiModalAccuracyORM": 0.45833333358168604, "step": 1410, "train_speed(iter/s)": 0.041023 }, { "clip_ratio": 0.0, "completion_length": 14.2, "epoch": 0.5717171717171717, "grad_norm": 4.825331211090088, "kl": 0.18311767578125, "learning_rate": 2e-07, "loss": 0.02725890576839447, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.24166667684912682, "reward_std": 0.2338038921356201, "rewards/MultiModalAccuracyORM": 0.24166667684912682, "step": 1415, "train_speed(iter/s)": 0.04103 }, { "clip_ratio": 0.0, "completion_length": 32.65, "epoch": 0.5737373737373738, "grad_norm": 1.8680031299591064, "kl": 0.0274566650390625, "learning_rate": 2e-07, "loss": 0.0017455607652664185, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000074505806, "reward_std": 0.12001575231552124, "rewards/MultiModalAccuracyORM": 0.17500000074505806, "step": 1420, "train_speed(iter/s)": 0.041024 }, { "clip_ratio": 0.0, "completion_length": 10.9, "epoch": 0.5757575757575758, "grad_norm": 3.193700075149536, "kl": 0.305999755859375, "learning_rate": 2e-07, "loss": 0.046308600902557374, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000074505806, "reward_std": 0.29177860021591184, "rewards/MultiModalAccuracyORM": 0.4000000074505806, "step": 1425, "train_speed(iter/s)": 0.041037 }, { "clip_ratio": 0.0, "completion_length": 12.95, "epoch": 0.5777777777777777, "grad_norm": 2.843719244003296, "kl": 0.0330230712890625, "learning_rate": 2e-07, "loss": -0.04594253897666931, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.3008869707584381, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 1430, "train_speed(iter/s)": 0.041054 }, { "clip_ratio": 0.0, "completion_length": 20.0, "epoch": 0.5797979797979798, "grad_norm": 23.12917137145996, "kl": 0.0993408203125, "learning_rate": 2e-07, "loss": 0.021137547492980958, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2166666716337204, "reward_std": 0.2790306001901627, "rewards/MultiModalAccuracyORM": 0.2166666716337204, "step": 1435, "train_speed(iter/s)": 0.041061 }, { "clip_ratio": 0.0, "completion_length": 7.25, "epoch": 0.5818181818181818, "grad_norm": 17.79547882080078, "kl": 0.1023834228515625, "learning_rate": 2e-07, "loss": 0.00415017232298851, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.20369119942188263, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 1440, "train_speed(iter/s)": 0.041072 }, { "clip_ratio": 0.0, "completion_length": 7.2, "epoch": 0.5838383838383838, "grad_norm": 15.119973182678223, "kl": 0.11974754333496093, "learning_rate": 2e-07, "loss": -0.008057641983032226, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000089406967, "reward_std": 0.2770525634288788, "rewards/MultiModalAccuracyORM": 0.3250000089406967, "step": 1445, "train_speed(iter/s)": 0.041081 }, { "clip_ratio": 0.0, "completion_length": 6.6, "epoch": 0.5858585858585859, "grad_norm": 0.13666389882564545, "kl": 0.0672607421875, "learning_rate": 2e-07, "loss": -0.0010352015495300293, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333373069763, "reward_std": 0.14996607303619386, "rewards/MultiModalAccuracyORM": 0.2083333373069763, "step": 1450, "train_speed(iter/s)": 0.041098 }, { "clip_ratio": 0.0, "completion_length": 10.3, "epoch": 0.5878787878787879, "grad_norm": 11.365659713745117, "kl": 0.0847259521484375, "learning_rate": 2e-07, "loss": 0.014445498585700989, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1500000037252903, "reward_std": 0.23955530524253846, "rewards/MultiModalAccuracyORM": 0.1500000037252903, "step": 1455, "train_speed(iter/s)": 0.041108 }, { "clip_ratio": 0.0, "completion_length": 5.5, "epoch": 0.5898989898989899, "grad_norm": 25.425418853759766, "kl": 0.070880126953125, "learning_rate": 2e-07, "loss": 0.00023016731720417737, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666753590107, "reward_std": 0.3265491545200348, "rewards/MultiModalAccuracyORM": 0.3916666753590107, "step": 1460, "train_speed(iter/s)": 0.041127 }, { "clip_ratio": 0.0, "completion_length": 12.35, "epoch": 0.591919191919192, "grad_norm": 11.779102325439453, "kl": 0.07333221435546874, "learning_rate": 2e-07, "loss": 0.0254564106464386, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.16626566052436828, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 1465, "train_speed(iter/s)": 0.041143 }, { "clip_ratio": 0.0, "completion_length": 6.55, "epoch": 0.593939393939394, "grad_norm": 1.78038489818573, "kl": 0.1328155517578125, "learning_rate": 2e-07, "loss": 0.008091837167739868, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500000298023225, "reward_std": 0.1293427586555481, "rewards/MultiModalAccuracyORM": 0.27500000298023225, "step": 1470, "train_speed(iter/s)": 0.041154 }, { "clip_ratio": 0.0, "completion_length": 35.25, "epoch": 0.5959595959595959, "grad_norm": 2.518378734588623, "kl": 0.1015869140625, "learning_rate": 2e-07, "loss": -0.03122214078903198, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333358168602, "reward_std": 0.3322981417179108, "rewards/MultiModalAccuracyORM": 0.4333333358168602, "step": 1475, "train_speed(iter/s)": 0.041146 }, { "clip_ratio": 0.0, "completion_length": 5.5, "epoch": 0.597979797979798, "grad_norm": 20.898664474487305, "kl": 0.1433135986328125, "learning_rate": 2e-07, "loss": -0.02608821392059326, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2250000059604645, "reward_std": 0.287842845916748, "rewards/MultiModalAccuracyORM": 0.2250000059604645, "step": 1480, "train_speed(iter/s)": 0.041162 }, { "clip_ratio": 0.0, "completion_length": 10.8, "epoch": 0.6, "grad_norm": 0.11180847883224487, "kl": 0.13046875, "learning_rate": 2e-07, "loss": 0.003093409538269043, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666666716337205, "reward_std": 0.12937834858894348, "rewards/MultiModalAccuracyORM": 0.21666666716337205, "step": 1485, "train_speed(iter/s)": 0.041171 }, { "clip_ratio": 0.0, "completion_length": 7.85, "epoch": 0.602020202020202, "grad_norm": 12.01523494720459, "kl": 0.187371826171875, "learning_rate": 2e-07, "loss": -0.008616887032985687, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166667237877847, "reward_std": 0.21550226211547852, "rewards/MultiModalAccuracyORM": 0.29166667237877847, "step": 1490, "train_speed(iter/s)": 0.041188 }, { "clip_ratio": 0.0, "completion_length": 6.75, "epoch": 0.604040404040404, "grad_norm": 14.021830558776855, "kl": 0.16456298828125, "learning_rate": 2e-07, "loss": 0.010373742878437042, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5166666701436042, "reward_std": 0.21447905600070954, "rewards/MultiModalAccuracyORM": 0.5166666701436042, "step": 1495, "train_speed(iter/s)": 0.041207 }, { "epoch": 0.6060606060606061, "grad_norm": 1.3669841289520264, "learning_rate": 2e-07, "loss": -0.011987817287445069, "memory(GiB)": 104.49, "step": 1500, "train_speed(iter/s)": 0.041216 }, { "epoch": 0.6060606060606061, "eval_clip_ratio": 0.0, "eval_completion_length": 23.09000030040741, "eval_kl": 0.12807769775390626, "eval_loss": 0.0023684909101575613, "eval_response_clip_ratio": 0.0, "eval_reward": 0.42833334028720854, "eval_reward_std": 0.21841024577617646, "eval_rewards/MultiModalAccuracyORM": 0.42833334028720854, "eval_runtime": 243.0786, "eval_samples_per_second": 0.206, "eval_steps_per_second": 0.021, "step": 1500 }, { "clip_ratio": 0.0, "completion_length": 13.275, "epoch": 0.6080808080808081, "grad_norm": 10.859317779541016, "kl": 0.1035552978515625, "learning_rate": 2e-07, "loss": -0.011110000312328339, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35000000447034835, "reward_std": 0.2066851645708084, "rewards/MultiModalAccuracyORM": 0.35000000447034835, "step": 1505, "train_speed(iter/s)": 0.040874 }, { "clip_ratio": 0.0, "completion_length": 61.45, "epoch": 0.6101010101010101, "grad_norm": 0.055811017751693726, "kl": 0.03581314086914063, "learning_rate": 2e-07, "loss": 0.04541417956352234, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666731238365, "reward_std": 0.33700530230998993, "rewards/MultiModalAccuracyORM": 0.3916666731238365, "step": 1510, "train_speed(iter/s)": 0.040868 }, { "clip_ratio": 0.0, "completion_length": 6.1, "epoch": 0.6121212121212121, "grad_norm": 2.9291131496429443, "kl": 0.076611328125, "learning_rate": 2e-07, "loss": 0.0033688426017761232, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.40000000298023225, "reward_std": 0.22297748029232026, "rewards/MultiModalAccuracyORM": 0.40000000298023225, "step": 1515, "train_speed(iter/s)": 0.040893 }, { "clip_ratio": 0.0, "completion_length": 39.8, "epoch": 0.6141414141414141, "grad_norm": 10.698760032653809, "kl": 0.024103546142578126, "learning_rate": 2e-07, "loss": 0.033906325697898865, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1416666716337204, "reward_std": 0.24487241208553315, "rewards/MultiModalAccuracyORM": 0.1416666716337204, "step": 1520, "train_speed(iter/s)": 0.040902 }, { "clip_ratio": 0.0, "completion_length": 9.95, "epoch": 0.6161616161616161, "grad_norm": 5.847660541534424, "kl": 0.141815185546875, "learning_rate": 2e-07, "loss": -0.014752772450447083, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.40000000447034834, "reward_std": 0.27756677865982055, "rewards/MultiModalAccuracyORM": 0.40000000447034834, "step": 1525, "train_speed(iter/s)": 0.04091 }, { "clip_ratio": 0.0, "completion_length": 6.85, "epoch": 0.6181818181818182, "grad_norm": 2.933770179748535, "kl": 0.1540740966796875, "learning_rate": 2e-07, "loss": 0.021346482634544372, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500001043081284, "reward_std": 0.3767612546682358, "rewards/MultiModalAccuracyORM": 0.32500001043081284, "step": 1530, "train_speed(iter/s)": 0.040919 }, { "clip_ratio": 0.0, "completion_length": 20.35, "epoch": 0.6202020202020202, "grad_norm": 6.487882614135742, "kl": 0.08126373291015625, "learning_rate": 2e-07, "loss": -0.02819029986858368, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000037252903, "reward_std": 0.2875886201858521, "rewards/MultiModalAccuracyORM": 0.4000000037252903, "step": 1535, "train_speed(iter/s)": 0.040926 }, { "clip_ratio": 0.0, "completion_length": 13.7, "epoch": 0.6222222222222222, "grad_norm": 0.1822008639574051, "kl": 0.244976806640625, "learning_rate": 2e-07, "loss": 0.02670127749443054, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.43333333879709246, "reward_std": 0.14589657187461852, "rewards/MultiModalAccuracyORM": 0.43333333879709246, "step": 1540, "train_speed(iter/s)": 0.040936 }, { "clip_ratio": 0.0, "completion_length": 54.35, "epoch": 0.6242424242424243, "grad_norm": 5.22224235534668, "kl": 0.087286376953125, "learning_rate": 2e-07, "loss": 0.011146068572998047, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.39166667237877845, "reward_std": 0.39465543925762175, "rewards/MultiModalAccuracyORM": 0.39166667237877845, "step": 1545, "train_speed(iter/s)": 0.040941 }, { "clip_ratio": 0.0, "completion_length": 46.65, "epoch": 0.6262626262626263, "grad_norm": 12.465606689453125, "kl": 0.11739501953125, "learning_rate": 2e-07, "loss": 0.01348254531621933, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000447034837, "reward_std": 0.18332211077213287, "rewards/MultiModalAccuracyORM": 0.37500000447034837, "step": 1550, "train_speed(iter/s)": 0.040941 }, { "clip_ratio": 0.0, "completion_length": 12.05, "epoch": 0.6282828282828283, "grad_norm": 0.03528100252151489, "kl": 0.059906005859375, "learning_rate": 2e-07, "loss": 0.002536106109619141, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000223517417, "reward_std": 0.12558708488941192, "rewards/MultiModalAccuracyORM": 0.32500000223517417, "step": 1555, "train_speed(iter/s)": 0.040957 }, { "clip_ratio": 0.0, "completion_length": 7.5, "epoch": 0.6303030303030303, "grad_norm": 15.021883010864258, "kl": 0.11079330444335937, "learning_rate": 2e-07, "loss": 0.0029231052845716476, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333380520344, "reward_std": 0.20973873138427734, "rewards/MultiModalAccuracyORM": 0.4333333380520344, "step": 1560, "train_speed(iter/s)": 0.040974 }, { "clip_ratio": 0.0, "completion_length": 15.1, "epoch": 0.6323232323232323, "grad_norm": 2.5578255653381348, "kl": 0.04172821044921875, "learning_rate": 2e-07, "loss": 0.004573901742696762, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.12500000447034837, "reward_std": 0.18087121844291687, "rewards/MultiModalAccuracyORM": 0.12500000447034837, "step": 1565, "train_speed(iter/s)": 0.040988 }, { "clip_ratio": 0.0, "completion_length": 6.8, "epoch": 0.6343434343434343, "grad_norm": 22.243240356445312, "kl": 0.158673095703125, "learning_rate": 2e-07, "loss": -0.008480211347341537, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666667759418485, "reward_std": 0.28480601906776426, "rewards/MultiModalAccuracyORM": 0.41666667759418485, "step": 1570, "train_speed(iter/s)": 0.040998 }, { "clip_ratio": 0.0, "completion_length": 22.0, "epoch": 0.6363636363636364, "grad_norm": 25.038570404052734, "kl": 0.1517974853515625, "learning_rate": 2e-07, "loss": 0.04977948367595673, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.22625695466995238, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 1575, "train_speed(iter/s)": 0.041005 }, { "clip_ratio": 0.0, "completion_length": 9.15, "epoch": 0.6383838383838384, "grad_norm": 0.11025875806808472, "kl": 0.0567169189453125, "learning_rate": 2e-07, "loss": 0.004630526155233383, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666679084301, "reward_std": 0.13032740950584412, "rewards/MultiModalAccuracyORM": 0.3916666679084301, "step": 1580, "train_speed(iter/s)": 0.041022 }, { "clip_ratio": 0.0, "completion_length": 29.9, "epoch": 0.6404040404040404, "grad_norm": 8.77802562713623, "kl": 0.06422119140625, "learning_rate": 2e-07, "loss": -0.002487625740468502, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666708886623, "reward_std": 0.2822715133428574, "rewards/MultiModalAccuracyORM": 0.2916666708886623, "step": 1585, "train_speed(iter/s)": 0.041027 }, { "clip_ratio": 0.0, "completion_length": 10.1, "epoch": 0.6424242424242425, "grad_norm": 0.061026524752378464, "kl": 0.181072998046875, "learning_rate": 2e-07, "loss": 0.012957209348678589, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333333805203437, "reward_std": 0.11928532719612121, "rewards/MultiModalAccuracyORM": 0.28333333805203437, "step": 1590, "train_speed(iter/s)": 0.041041 }, { "clip_ratio": 0.0, "completion_length": 7.6, "epoch": 0.6444444444444445, "grad_norm": 5.596570014953613, "kl": 0.17645263671875, "learning_rate": 2e-07, "loss": -0.0008578440174460411, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666753590107, "reward_std": 0.3071531385183334, "rewards/MultiModalAccuracyORM": 0.2666666753590107, "step": 1595, "train_speed(iter/s)": 0.041048 }, { "clip_ratio": 0.0, "completion_length": 71.7, "epoch": 0.6464646464646465, "grad_norm": 26.054533004760742, "kl": 0.11879425048828125, "learning_rate": 2e-07, "loss": 0.007277928292751312, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000894069674, "reward_std": 0.2732968896627426, "rewards/MultiModalAccuracyORM": 0.37500000894069674, "step": 1600, "train_speed(iter/s)": 0.041045 }, { "clip_ratio": 0.0, "completion_length": 44.45, "epoch": 0.6484848484848484, "grad_norm": 0.11397194862365723, "kl": 0.0313624382019043, "learning_rate": 2e-07, "loss": 0.0012240668758749962, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333334028720857, "reward_std": 0.152222341299057, "rewards/MultiModalAccuracyORM": 0.33333334028720857, "step": 1605, "train_speed(iter/s)": 0.041049 }, { "clip_ratio": 0.0, "completion_length": 54.7, "epoch": 0.6505050505050505, "grad_norm": 0.8132848739624023, "kl": 0.099078369140625, "learning_rate": 2e-07, "loss": 0.008613920211791993, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.13333333432674407, "reward_std": 0.20110656023025514, "rewards/MultiModalAccuracyORM": 0.13333333432674407, "step": 1610, "train_speed(iter/s)": 0.041055 }, { "clip_ratio": 0.0, "completion_length": 106.2, "epoch": 0.6525252525252525, "grad_norm": 2.1414718627929688, "kl": 0.05146484375, "learning_rate": 2e-07, "loss": 0.0011494815349578857, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333333507180213, "reward_std": 0.25270916521549225, "rewards/MultiModalAccuracyORM": 0.33333333507180213, "step": 1615, "train_speed(iter/s)": 0.041049 }, { "clip_ratio": 0.0, "completion_length": 7.7, "epoch": 0.6545454545454545, "grad_norm": 2.636408567428589, "kl": 0.05029296875, "learning_rate": 2e-07, "loss": -0.02351543605327606, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5083333447575569, "reward_std": 0.2792848199605942, "rewards/MultiModalAccuracyORM": 0.5083333447575569, "step": 1620, "train_speed(iter/s)": 0.041065 }, { "clip_ratio": 0.0, "completion_length": 47.3, "epoch": 0.6565656565656566, "grad_norm": 3.0985336303710938, "kl": 0.065228271484375, "learning_rate": 2e-07, "loss": -0.014748664200305938, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4833333395421505, "reward_std": 0.16225576102733613, "rewards/MultiModalAccuracyORM": 0.4833333395421505, "step": 1625, "train_speed(iter/s)": 0.041069 }, { "clip_ratio": 0.0, "completion_length": 17.65, "epoch": 0.6585858585858586, "grad_norm": 9.992680549621582, "kl": 0.16975555419921876, "learning_rate": 2e-07, "loss": 0.008018460124731064, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.21368903517723084, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 1630, "train_speed(iter/s)": 0.041077 }, { "clip_ratio": 0.0, "completion_length": 5.95, "epoch": 0.6606060606060606, "grad_norm": 47.361576080322266, "kl": 0.125982666015625, "learning_rate": 2e-07, "loss": 0.015030686557292939, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4666666753590107, "reward_std": 0.2340581238269806, "rewards/MultiModalAccuracyORM": 0.4666666753590107, "step": 1635, "train_speed(iter/s)": 0.041091 }, { "clip_ratio": 0.0, "completion_length": 15.05, "epoch": 0.6626262626262627, "grad_norm": 6.931950569152832, "kl": 0.16407470703125, "learning_rate": 2e-07, "loss": -0.012672655284404755, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4166666746139526, "reward_std": 0.215248042345047, "rewards/MultiModalAccuracyORM": 0.4166666746139526, "step": 1640, "train_speed(iter/s)": 0.041096 }, { "clip_ratio": 0.0, "completion_length": 13.3, "epoch": 0.6646464646464646, "grad_norm": 0.08681845664978027, "kl": 0.1269195556640625, "learning_rate": 2e-07, "loss": -0.0032407425343990324, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5083333358168602, "reward_std": 0.13338824808597566, "rewards/MultiModalAccuracyORM": 0.5083333358168602, "step": 1645, "train_speed(iter/s)": 0.041111 }, { "clip_ratio": 0.0, "completion_length": 45.7, "epoch": 0.6666666666666666, "grad_norm": 3.8581395149230957, "kl": 0.121484375, "learning_rate": 2e-07, "loss": 0.008351793140172958, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333333432674407, "reward_std": 0.2581467509269714, "rewards/MultiModalAccuracyORM": 0.28333333432674407, "step": 1650, "train_speed(iter/s)": 0.041103 }, { "clip_ratio": 0.0, "completion_length": 29.15, "epoch": 0.6686868686868687, "grad_norm": 17.391639709472656, "kl": 0.13189697265625, "learning_rate": 2e-07, "loss": 0.056326770782470705, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4083333410322666, "reward_std": 0.3480859398841858, "rewards/MultiModalAccuracyORM": 0.4083333410322666, "step": 1655, "train_speed(iter/s)": 0.041102 }, { "clip_ratio": 0.0, "completion_length": 9.25, "epoch": 0.6707070707070707, "grad_norm": 7.648516654968262, "kl": 0.2052001953125, "learning_rate": 2e-07, "loss": -0.00421803817152977, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3500000037252903, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.3500000037252903, "step": 1660, "train_speed(iter/s)": 0.041107 }, { "clip_ratio": 0.0, "completion_length": 35.35, "epoch": 0.6727272727272727, "grad_norm": 1.1766724586486816, "kl": 0.0945709228515625, "learning_rate": 2e-07, "loss": 0.013910901546478272, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166667684912684, "reward_std": 0.24487241804599763, "rewards/MultiModalAccuracyORM": 0.29166667684912684, "step": 1665, "train_speed(iter/s)": 0.041117 }, { "clip_ratio": 0.0, "completion_length": 25.35, "epoch": 0.6747474747474748, "grad_norm": 4.918646335601807, "kl": 0.023187255859375, "learning_rate": 2e-07, "loss": -0.009105654805898667, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333507180213, "reward_std": 0.2074824631214142, "rewards/MultiModalAccuracyORM": 0.20833333507180213, "step": 1670, "train_speed(iter/s)": 0.041129 }, { "clip_ratio": 0.0, "completion_length": 29.25, "epoch": 0.6767676767676768, "grad_norm": 10.536828994750977, "kl": 0.0798187255859375, "learning_rate": 2e-07, "loss": 0.02544976770877838, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333380520344, "reward_std": 0.16451202929019929, "rewards/MultiModalAccuracyORM": 0.2583333380520344, "step": 1675, "train_speed(iter/s)": 0.041135 }, { "clip_ratio": 0.0, "completion_length": 52.3, "epoch": 0.6787878787878788, "grad_norm": 5.117887020111084, "kl": 0.02090301513671875, "learning_rate": 2e-07, "loss": 0.04579094052314758, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.31495430171489713, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 1680, "train_speed(iter/s)": 0.041133 }, { "clip_ratio": 0.0, "completion_length": 11.7, "epoch": 0.6808080808080809, "grad_norm": 8.01219367980957, "kl": 0.1265289306640625, "learning_rate": 2e-07, "loss": 0.019950807094573975, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333380520344, "reward_std": 0.16852192878723143, "rewards/MultiModalAccuracyORM": 0.4333333380520344, "step": 1685, "train_speed(iter/s)": 0.041146 }, { "clip_ratio": 0.0, "completion_length": 16.9, "epoch": 0.6828282828282828, "grad_norm": 7.546853065490723, "kl": 0.0402618408203125, "learning_rate": 2e-07, "loss": 0.030116382241249084, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.42500000447034836, "reward_std": 0.22629254460334777, "rewards/MultiModalAccuracyORM": 0.42500000447034836, "step": 1690, "train_speed(iter/s)": 0.041159 }, { "clip_ratio": 0.0, "completion_length": 47.2, "epoch": 0.6848484848484848, "grad_norm": 8.680946350097656, "kl": 0.1186279296875, "learning_rate": 2e-07, "loss": -0.014576731622219086, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45833333507180213, "reward_std": 0.2074824631214142, "rewards/MultiModalAccuracyORM": 0.45833333507180213, "step": 1695, "train_speed(iter/s)": 0.041143 }, { "clip_ratio": 0.0, "completion_length": 10.65, "epoch": 0.6868686868686869, "grad_norm": 33.545352935791016, "kl": 0.11261825561523438, "learning_rate": 2e-07, "loss": 0.004046386480331421, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3250000014901161, "reward_std": 0.18561154305934907, "rewards/MultiModalAccuracyORM": 0.3250000014901161, "step": 1700, "train_speed(iter/s)": 0.041159 }, { "clip_ratio": 0.0, "completion_length": 69.85, "epoch": 0.6888888888888889, "grad_norm": 13.335136413574219, "kl": 0.11529541015625, "learning_rate": 2e-07, "loss": 0.0011761213652789592, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666701436043, "reward_std": 0.18482151627540588, "rewards/MultiModalAccuracyORM": 0.3166666701436043, "step": 1705, "train_speed(iter/s)": 0.041157 }, { "clip_ratio": 0.0, "completion_length": 21.45, "epoch": 0.6909090909090909, "grad_norm": 14.620392799377441, "kl": 0.07541313171386718, "learning_rate": 2e-07, "loss": 0.01065676361322403, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1916666716337204, "reward_std": 0.24961273670196532, "rewards/MultiModalAccuracyORM": 0.1916666716337204, "step": 1710, "train_speed(iter/s)": 0.041164 }, { "clip_ratio": 0.0, "completion_length": 19.2, "epoch": 0.692929292929293, "grad_norm": 1.2891874313354492, "kl": 0.13163909912109376, "learning_rate": 2e-07, "loss": 0.02046767473220825, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4916666731238365, "reward_std": 0.21374863088130952, "rewards/MultiModalAccuracyORM": 0.4916666731238365, "step": 1715, "train_speed(iter/s)": 0.041157 }, { "clip_ratio": 0.0, "completion_length": 11.4, "epoch": 0.694949494949495, "grad_norm": 3.101806879043579, "kl": 0.22337646484375, "learning_rate": 2e-07, "loss": 0.008609502017498017, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500001341104505, "reward_std": 0.3003751873970032, "rewards/MultiModalAccuracyORM": 0.37500001341104505, "step": 1720, "train_speed(iter/s)": 0.041168 }, { "clip_ratio": 0.0, "completion_length": 16.2, "epoch": 0.696969696969697, "grad_norm": 17.069448471069336, "kl": 0.10420684814453125, "learning_rate": 2e-07, "loss": -0.020038720965385438, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000298023224, "reward_std": 0.14433756470680237, "rewards/MultiModalAccuracyORM": 0.32500000298023224, "step": 1725, "train_speed(iter/s)": 0.041178 }, { "clip_ratio": 0.0, "completion_length": 24.0, "epoch": 0.6989898989898989, "grad_norm": 2.795525074005127, "kl": 0.0689239501953125, "learning_rate": 2e-07, "loss": 0.022227957844734192, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000074505805, "reward_std": 0.15824586153030396, "rewards/MultiModalAccuracyORM": 0.22500000074505805, "step": 1730, "train_speed(iter/s)": 0.041179 }, { "clip_ratio": 0.0, "completion_length": 63.75, "epoch": 0.701010101010101, "grad_norm": 2.3581957817077637, "kl": 0.04788818359375, "learning_rate": 2e-07, "loss": 0.033317530155181886, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35000000298023226, "reward_std": 0.2837563753128052, "rewards/MultiModalAccuracyORM": 0.35000000298023226, "step": 1735, "train_speed(iter/s)": 0.04118 }, { "clip_ratio": 0.0, "completion_length": 10.5, "epoch": 0.703030303030303, "grad_norm": 2.782379627227783, "kl": 0.080255126953125, "learning_rate": 2e-07, "loss": -0.012095755338668824, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.18333333879709243, "reward_std": 0.28154108822345736, "rewards/MultiModalAccuracyORM": 0.18333333879709243, "step": 1740, "train_speed(iter/s)": 0.041192 }, { "clip_ratio": 0.0, "completion_length": 17.2, "epoch": 0.705050505050505, "grad_norm": 3.129946708679199, "kl": 0.04556884765625, "learning_rate": 2e-07, "loss": 0.037814974784851074, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45000000223517417, "reward_std": 0.21378422081470488, "rewards/MultiModalAccuracyORM": 0.45000000223517417, "step": 1745, "train_speed(iter/s)": 0.041197 }, { "epoch": 0.7070707070707071, "grad_norm": 2.4902050495147705, "learning_rate": 2e-07, "loss": 0.0172103151679039, "memory(GiB)": 104.49, "step": 1750, "train_speed(iter/s)": 0.041202 }, { "epoch": 0.7070707070707071, "eval_clip_ratio": 0.0, "eval_completion_length": 34.29833379745483, "eval_kl": 0.10184234619140625, "eval_loss": 0.012326983734965324, "eval_response_clip_ratio": 0.0, "eval_reward": 0.4183333376049995, "eval_reward_std": 0.1789151507616043, "eval_rewards/MultiModalAccuracyORM": 0.4183333376049995, "eval_runtime": 267.6806, "eval_samples_per_second": 0.187, "eval_steps_per_second": 0.019, "step": 1750 }, { "clip_ratio": 0.0, "completion_length": 31.0, "epoch": 0.7090909090909091, "grad_norm": 14.173089981079102, "kl": 0.10649490356445312, "learning_rate": 2e-07, "loss": 0.007458774745464325, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28750000670552256, "reward_std": 0.23671061247587205, "rewards/MultiModalAccuracyORM": 0.28750000670552256, "step": 1755, "train_speed(iter/s)": 0.040873 }, { "clip_ratio": 0.0, "completion_length": 19.6, "epoch": 0.7111111111111111, "grad_norm": 2.1408114433288574, "kl": 0.066253662109375, "learning_rate": 2e-07, "loss": 0.027722400426864625, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.16666667237877847, "reward_std": 0.2669951319694519, "rewards/MultiModalAccuracyORM": 0.16666667237877847, "step": 1760, "train_speed(iter/s)": 0.040871 }, { "clip_ratio": 0.0, "completion_length": 22.25, "epoch": 0.7131313131313132, "grad_norm": 24.069496154785156, "kl": 0.0887176513671875, "learning_rate": 2e-07, "loss": 0.00502915009856224, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333395421505, "reward_std": 0.18407654762268066, "rewards/MultiModalAccuracyORM": 0.2583333395421505, "step": 1765, "train_speed(iter/s)": 0.040877 }, { "clip_ratio": 0.0, "completion_length": 26.2, "epoch": 0.7151515151515152, "grad_norm": 2.3050827980041504, "kl": 0.2020782470703125, "learning_rate": 2e-07, "loss": 0.016819214820861815, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000022351742, "reward_std": 0.174764084815979, "rewards/MultiModalAccuracyORM": 0.2750000022351742, "step": 1770, "train_speed(iter/s)": 0.040888 }, { "clip_ratio": 0.0, "completion_length": 5.25, "epoch": 0.7171717171717171, "grad_norm": 8.913907051086426, "kl": 0.137213134765625, "learning_rate": 2e-07, "loss": -0.006190218776464462, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5250000029802322, "reward_std": 0.22078081369400024, "rewards/MultiModalAccuracyORM": 0.5250000029802322, "step": 1775, "train_speed(iter/s)": 0.040903 }, { "clip_ratio": 0.0, "completion_length": 30.05, "epoch": 0.7191919191919192, "grad_norm": 2.8246963024139404, "kl": 0.11649169921875, "learning_rate": 2e-07, "loss": -0.06523974537849427, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35833333507180215, "reward_std": 0.22629254460334777, "rewards/MultiModalAccuracyORM": 0.35833333507180215, "step": 1780, "train_speed(iter/s)": 0.040913 }, { "clip_ratio": 0.0, "completion_length": 25.3, "epoch": 0.7212121212121212, "grad_norm": 7.319549083709717, "kl": 0.100701904296875, "learning_rate": 2e-07, "loss": 0.03789505362510681, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000149011612, "reward_std": 0.2712534427642822, "rewards/MultiModalAccuracyORM": 0.2750000149011612, "step": 1785, "train_speed(iter/s)": 0.040921 }, { "clip_ratio": 0.0, "completion_length": 47.75, "epoch": 0.7232323232323232, "grad_norm": 8.2145357131958, "kl": 0.13018798828125, "learning_rate": 2e-07, "loss": -0.021410945057868957, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666738688946, "reward_std": 0.2325587123632431, "rewards/MultiModalAccuracyORM": 0.3916666738688946, "step": 1790, "train_speed(iter/s)": 0.040929 }, { "clip_ratio": 0.0, "completion_length": 8.7, "epoch": 0.7252525252525253, "grad_norm": 8.516419410705566, "kl": 0.1542633056640625, "learning_rate": 2e-07, "loss": 0.02146460711956024, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000059604645, "reward_std": 0.21823472976684571, "rewards/MultiModalAccuracyORM": 0.3000000059604645, "step": 1795, "train_speed(iter/s)": 0.040941 }, { "clip_ratio": 0.0, "completion_length": 8.8, "epoch": 0.7272727272727273, "grad_norm": 10.487430572509766, "kl": 0.2330535888671875, "learning_rate": 2e-07, "loss": 0.03371854722499847, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000521540644, "reward_std": 0.16925235390663146, "rewards/MultiModalAccuracyORM": 0.37500000521540644, "step": 1800, "train_speed(iter/s)": 0.040952 }, { "clip_ratio": 0.0, "completion_length": 10.9, "epoch": 0.7292929292929293, "grad_norm": 2.5021793842315674, "kl": 0.053016281127929686, "learning_rate": 2e-07, "loss": -0.005027930065989494, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.358333333581686, "reward_std": 0.1193209171295166, "rewards/MultiModalAccuracyORM": 0.358333333581686, "step": 1805, "train_speed(iter/s)": 0.040965 }, { "clip_ratio": 0.0, "completion_length": 16.3, "epoch": 0.7313131313131314, "grad_norm": 9.409316062927246, "kl": 0.077154541015625, "learning_rate": 2e-07, "loss": 0.00013190507888793945, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333343267441, "reward_std": 0.14188667237758637, "rewards/MultiModalAccuracyORM": 0.3083333343267441, "step": 1810, "train_speed(iter/s)": 0.040974 }, { "clip_ratio": 0.0, "completion_length": 13.2, "epoch": 0.7333333333333333, "grad_norm": 8.413249015808105, "kl": 0.06329345703125, "learning_rate": 2e-07, "loss": 0.0067844375967979435, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333417773247, "reward_std": 0.2878072619438171, "rewards/MultiModalAccuracyORM": 0.3833333417773247, "step": 1815, "train_speed(iter/s)": 0.040988 }, { "clip_ratio": 0.0, "completion_length": 42.4, "epoch": 0.7353535353535353, "grad_norm": 3.3386476039886475, "kl": 0.0814666748046875, "learning_rate": 2e-07, "loss": 0.020126067101955414, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.37500000596046446, "reward_std": 0.18087121844291687, "rewards/MultiModalAccuracyORM": 0.37500000596046446, "step": 1820, "train_speed(iter/s)": 0.041 }, { "clip_ratio": 0.0, "completion_length": 18.6, "epoch": 0.7373737373737373, "grad_norm": 11.123106956481934, "kl": 0.13977203369140626, "learning_rate": 2e-07, "loss": 0.0059658966958522795, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.15821027159690856, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 1825, "train_speed(iter/s)": 0.041002 }, { "clip_ratio": 0.0, "completion_length": 23.75, "epoch": 0.7393939393939394, "grad_norm": 4.5245361328125, "kl": 0.098736572265625, "learning_rate": 2e-07, "loss": -0.024525515735149384, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666731238365, "reward_std": 0.20995736718177796, "rewards/MultiModalAccuracyORM": 0.2666666731238365, "step": 1830, "train_speed(iter/s)": 0.040989 }, { "clip_ratio": 0.0, "completion_length": 25.1, "epoch": 0.7414141414141414, "grad_norm": 0.7691475749015808, "kl": 0.0991119384765625, "learning_rate": 2e-07, "loss": 0.039085444808006284, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4250000029802322, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.4250000029802322, "step": 1835, "train_speed(iter/s)": 0.040998 }, { "clip_ratio": 0.0, "completion_length": 19.0, "epoch": 0.7434343434343434, "grad_norm": 0.2410029023885727, "kl": 0.17838897705078124, "learning_rate": 2e-07, "loss": 0.04514871537685394, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.6166666708886623, "reward_std": 0.13258367776870728, "rewards/MultiModalAccuracyORM": 0.6166666708886623, "step": 1840, "train_speed(iter/s)": 0.040995 }, { "clip_ratio": 0.0, "completion_length": 11.9, "epoch": 0.7454545454545455, "grad_norm": 12.146939277648926, "kl": 0.097296142578125, "learning_rate": 2e-07, "loss": 0.02126455307006836, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333656191827, "reward_std": 0.17657731771469115, "rewards/MultiModalAccuracyORM": 0.20833333656191827, "step": 1845, "train_speed(iter/s)": 0.041 }, { "clip_ratio": 0.0, "completion_length": 10.0, "epoch": 0.7474747474747475, "grad_norm": 10.014187812805176, "kl": 0.12047119140625, "learning_rate": 2e-07, "loss": 0.0045259218662977215, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.46666667237877846, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.46666667237877846, "step": 1850, "train_speed(iter/s)": 0.041019 }, { "clip_ratio": 0.0, "completion_length": 9.3, "epoch": 0.7494949494949495, "grad_norm": 0.34578633308410645, "kl": 0.13382987976074218, "learning_rate": 2e-07, "loss": 0.003971926495432853, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1833333358168602, "reward_std": 0.1356445223093033, "rewards/MultiModalAccuracyORM": 0.1833333358168602, "step": 1855, "train_speed(iter/s)": 0.041027 }, { "clip_ratio": 0.0, "completion_length": 19.05, "epoch": 0.7515151515151515, "grad_norm": 17.808372497558594, "kl": 0.025757217407226564, "learning_rate": 2e-07, "loss": 0.035965240001678465, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333334252238274, "reward_std": 0.19713521599769593, "rewards/MultiModalAccuracyORM": 0.28333334252238274, "step": 1860, "train_speed(iter/s)": 0.041022 }, { "clip_ratio": 0.0, "completion_length": 9.8, "epoch": 0.7535353535353535, "grad_norm": 24.15494155883789, "kl": 0.0437255859375, "learning_rate": 2e-07, "loss": -0.06361854076385498, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000096857548, "reward_std": 0.36670139729976653, "rewards/MultiModalAccuracyORM": 0.4000000096857548, "step": 1865, "train_speed(iter/s)": 0.041031 }, { "clip_ratio": 0.0, "completion_length": 8.85, "epoch": 0.7555555555555555, "grad_norm": 80.81800079345703, "kl": 0.08274688720703124, "learning_rate": 2e-07, "loss": 0.003989287465810776, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333395421505, "reward_std": 0.15846449732780457, "rewards/MultiModalAccuracyORM": 0.3083333395421505, "step": 1870, "train_speed(iter/s)": 0.041038 }, { "clip_ratio": 0.0, "completion_length": 12.5, "epoch": 0.7575757575757576, "grad_norm": 14.617817878723145, "kl": 0.090728759765625, "learning_rate": 2e-07, "loss": -0.0045210480690002445, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666731238365, "reward_std": 0.19337954819202424, "rewards/MultiModalAccuracyORM": 0.2666666731238365, "step": 1875, "train_speed(iter/s)": 0.041048 }, { "clip_ratio": 0.0, "completion_length": 6.65, "epoch": 0.7595959595959596, "grad_norm": 13.89445972442627, "kl": 0.13492431640625, "learning_rate": 2e-07, "loss": 0.012078547477722168, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 1880, "train_speed(iter/s)": 0.041062 }, { "clip_ratio": 0.0, "completion_length": 61.05, "epoch": 0.7616161616161616, "grad_norm": 11.715389251708984, "kl": 0.1376861572265625, "learning_rate": 2e-07, "loss": -0.014951804280281067, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.40000000298023225, "reward_std": 0.19337954223155976, "rewards/MultiModalAccuracyORM": 0.40000000298023225, "step": 1885, "train_speed(iter/s)": 0.041063 }, { "clip_ratio": 0.0, "completion_length": 15.4, "epoch": 0.7636363636363637, "grad_norm": 0.07281157374382019, "kl": 0.095611572265625, "learning_rate": 2e-07, "loss": 0.012891271710395813, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.483333333581686, "reward_std": 0.12631751000881195, "rewards/MultiModalAccuracyORM": 0.483333333581686, "step": 1890, "train_speed(iter/s)": 0.041073 }, { "clip_ratio": 0.0, "completion_length": 57.6, "epoch": 0.7656565656565657, "grad_norm": 1.9145233631134033, "kl": 0.19044036865234376, "learning_rate": 2e-07, "loss": -0.03062499463558197, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.35000000447034835, "reward_std": 0.24490800201892854, "rewards/MultiModalAccuracyORM": 0.35000000447034835, "step": 1895, "train_speed(iter/s)": 0.041073 }, { "clip_ratio": 0.0, "completion_length": 15.8, "epoch": 0.7676767676767676, "grad_norm": 22.877309799194336, "kl": 0.161077880859375, "learning_rate": 2e-07, "loss": 0.008297159522771835, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.09041781425476074, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 1900, "train_speed(iter/s)": 0.041078 }, { "clip_ratio": 0.0, "completion_length": 13.0, "epoch": 0.7696969696969697, "grad_norm": 21.666425704956055, "kl": 0.1980316162109375, "learning_rate": 2e-07, "loss": 0.020768019556999206, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.46666666865348816, "reward_std": 0.24114990234375, "rewards/MultiModalAccuracyORM": 0.46666666865348816, "step": 1905, "train_speed(iter/s)": 0.041093 }, { "clip_ratio": 0.0, "completion_length": 8.25, "epoch": 0.7717171717171717, "grad_norm": 22.925674438476562, "kl": 0.0932861328125, "learning_rate": 2e-07, "loss": 0.009479768574237823, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30833334028720855, "reward_std": 0.2466856449842453, "rewards/MultiModalAccuracyORM": 0.30833334028720855, "step": 1910, "train_speed(iter/s)": 0.0411 }, { "clip_ratio": 0.0, "completion_length": 8.65, "epoch": 0.7737373737373737, "grad_norm": 0.14844609797000885, "kl": 0.232122802734375, "learning_rate": 2e-07, "loss": 0.010550656914710998, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.6166666686534882, "reward_std": 0.16454761922359468, "rewards/MultiModalAccuracyORM": 0.6166666686534882, "step": 1915, "train_speed(iter/s)": 0.041111 }, { "clip_ratio": 0.0, "completion_length": 29.85, "epoch": 0.7757575757575758, "grad_norm": 13.482421875, "kl": 0.120068359375, "learning_rate": 2e-07, "loss": 0.022914706170558928, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4500000044703484, "reward_std": 0.25292780101299284, "rewards/MultiModalAccuracyORM": 0.4500000044703484, "step": 1920, "train_speed(iter/s)": 0.041122 }, { "clip_ratio": 0.0, "completion_length": 8.7, "epoch": 0.7777777777777778, "grad_norm": 0.19085177779197693, "kl": 0.14432373046875, "learning_rate": 2e-07, "loss": 0.020079278945922853, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5416666671633721, "reward_std": 0.18859823644161225, "rewards/MultiModalAccuracyORM": 0.5416666671633721, "step": 1925, "train_speed(iter/s)": 0.04113 }, { "clip_ratio": 0.0, "completion_length": 57.35, "epoch": 0.7797979797979798, "grad_norm": 0.04123455658555031, "kl": 0.10629119873046874, "learning_rate": 2e-07, "loss": 0.02534767985343933, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4833333358168602, "reward_std": 0.1652424544095993, "rewards/MultiModalAccuracyORM": 0.4833333358168602, "step": 1930, "train_speed(iter/s)": 0.041128 }, { "clip_ratio": 0.0, "completion_length": 13.9, "epoch": 0.7818181818181819, "grad_norm": 7.716069221496582, "kl": 0.03204345703125, "learning_rate": 2e-07, "loss": 0.018103978037834166, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000596046447, "reward_std": 0.3275222271680832, "rewards/MultiModalAccuracyORM": 0.32500000596046447, "step": 1935, "train_speed(iter/s)": 0.041139 }, { "clip_ratio": 0.0, "completion_length": 15.0, "epoch": 0.7838383838383839, "grad_norm": 1.998159408569336, "kl": 0.2424346923828125, "learning_rate": 2e-07, "loss": -0.0022819479927420616, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666738688946, "reward_std": 0.2526735752820969, "rewards/MultiModalAccuracyORM": 0.3416666738688946, "step": 1940, "train_speed(iter/s)": 0.041144 }, { "clip_ratio": 0.0, "completion_length": 8.25, "epoch": 0.7858585858585858, "grad_norm": 0.11755078285932541, "kl": 0.1235809326171875, "learning_rate": 2e-07, "loss": 0.01756092607975006, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4083333358168602, "reward_std": 0.16145119071006775, "rewards/MultiModalAccuracyORM": 0.4083333358168602, "step": 1945, "train_speed(iter/s)": 0.041156 }, { "clip_ratio": 0.0, "completion_length": 29.45, "epoch": 0.7878787878787878, "grad_norm": 11.287028312683105, "kl": 0.05250396728515625, "learning_rate": 2e-07, "loss": -0.009032456576824189, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.36666667386889457, "reward_std": 0.30639870166778566, "rewards/MultiModalAccuracyORM": 0.36666667386889457, "step": 1950, "train_speed(iter/s)": 0.041159 }, { "clip_ratio": 0.0, "completion_length": 7.6, "epoch": 0.7898989898989899, "grad_norm": 0.1284160166978836, "kl": 0.046563720703125, "learning_rate": 2e-07, "loss": 0.0006015380378812552, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.6250000067055226, "reward_std": 0.1973894417285919, "rewards/MultiModalAccuracyORM": 0.6250000067055226, "step": 1955, "train_speed(iter/s)": 0.041172 }, { "clip_ratio": 0.0, "completion_length": 20.05, "epoch": 0.7919191919191919, "grad_norm": 0.5048889517784119, "kl": 0.0877197265625, "learning_rate": 2e-07, "loss": 0.0017469068989157677, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.125, "reward_std": 0.045226702094078065, "rewards/MultiModalAccuracyORM": 0.125, "step": 1960, "train_speed(iter/s)": 0.041177 }, { "clip_ratio": 0.0, "completion_length": 23.7, "epoch": 0.793939393939394, "grad_norm": 10.217628479003906, "kl": 0.1369842529296875, "learning_rate": 2e-07, "loss": -0.007052314281463623, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4083333395421505, "reward_std": 0.19968129992485045, "rewards/MultiModalAccuracyORM": 0.4083333395421505, "step": 1965, "train_speed(iter/s)": 0.041181 }, { "clip_ratio": 0.0, "completion_length": 31.1, "epoch": 0.795959595959596, "grad_norm": 15.147607803344727, "kl": 0.139697265625, "learning_rate": 2e-07, "loss": -0.0005793333053588867, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000521540643, "reward_std": 0.26928699016571045, "rewards/MultiModalAccuracyORM": 0.30000000521540643, "step": 1970, "train_speed(iter/s)": 0.041179 }, { "clip_ratio": 0.0, "completion_length": 17.05, "epoch": 0.797979797979798, "grad_norm": 14.508552551269531, "kl": 0.1334228515625, "learning_rate": 2e-07, "loss": 0.014681649208068848, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000447034836, "reward_std": 0.31593895256519317, "rewards/MultiModalAccuracyORM": 0.30000000447034836, "step": 1975, "train_speed(iter/s)": 0.041186 }, { "clip_ratio": 0.0, "completion_length": 11.4, "epoch": 0.8, "grad_norm": 14.245569229125977, "kl": 0.07449951171875, "learning_rate": 2e-07, "loss": 0.019247731566429137, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333334252238275, "reward_std": 0.256683474779129, "rewards/MultiModalAccuracyORM": 0.23333334252238275, "step": 1980, "train_speed(iter/s)": 0.041204 }, { "clip_ratio": 0.0, "completion_length": 19.0, "epoch": 0.802020202020202, "grad_norm": 0.06112133339047432, "kl": 0.09664306640625, "learning_rate": 2e-07, "loss": -0.010070499032735825, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45000000223517417, "reward_std": 0.17555411159992218, "rewards/MultiModalAccuracyORM": 0.45000000223517417, "step": 1985, "train_speed(iter/s)": 0.04121 }, { "clip_ratio": 0.0, "completion_length": 51.2, "epoch": 0.804040404040404, "grad_norm": 0.20859137177467346, "kl": 0.2631103515625, "learning_rate": 2e-07, "loss": -0.03446192741394043, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666679084301, "reward_std": 0.15194410383701323, "rewards/MultiModalAccuracyORM": 0.2666666679084301, "step": 1990, "train_speed(iter/s)": 0.041215 }, { "clip_ratio": 0.0, "completion_length": 14.6, "epoch": 0.806060606060606, "grad_norm": 2.347874879837036, "kl": 0.09171142578125, "learning_rate": 2e-07, "loss": 0.003209712356328964, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4833333395421505, "reward_std": 0.1770799547433853, "rewards/MultiModalAccuracyORM": 0.4833333395421505, "step": 1995, "train_speed(iter/s)": 0.041215 }, { "epoch": 0.8080808080808081, "grad_norm": 12.103494644165039, "learning_rate": 2e-07, "loss": 0.051232755184173584, "memory(GiB)": 104.49, "step": 2000, "train_speed(iter/s)": 0.041214 }, { "epoch": 0.8080808080808081, "eval_clip_ratio": 0.0, "eval_completion_length": 32.68000123023987, "eval_kl": 0.1109576416015625, "eval_loss": 0.001846806495450437, "eval_response_clip_ratio": 0.0, "eval_reward": 0.4066666714847088, "eval_reward_std": 0.1827806031703949, "eval_rewards/MultiModalAccuracyORM": 0.4066666714847088, "eval_runtime": 274.3294, "eval_samples_per_second": 0.182, "eval_steps_per_second": 0.018, "step": 2000 }, { "clip_ratio": 0.0, "completion_length": 17.275, "epoch": 0.8101010101010101, "grad_norm": 20.72494888305664, "kl": 0.09075469970703125, "learning_rate": 2e-07, "loss": 0.01332613080739975, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667200624942, "reward_std": 0.22227564305067063, "rewards/MultiModalAccuracyORM": 0.21666667200624942, "step": 2005, "train_speed(iter/s)": 0.04093 }, { "clip_ratio": 0.0, "completion_length": 15.2, "epoch": 0.8121212121212121, "grad_norm": 10.545307159423828, "kl": 0.157220458984375, "learning_rate": 2e-07, "loss": 0.02192305028438568, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2666666716337204, "reward_std": 0.21999078691005708, "rewards/MultiModalAccuracyORM": 0.2666666716337204, "step": 2010, "train_speed(iter/s)": 0.040938 }, { "clip_ratio": 0.0, "completion_length": 7.65, "epoch": 0.8141414141414142, "grad_norm": 0.1491260975599289, "kl": 0.1144989013671875, "learning_rate": 2e-07, "loss": 0.021004287898540495, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.40000000298023225, "reward_std": 0.17456946671009063, "rewards/MultiModalAccuracyORM": 0.40000000298023225, "step": 2015, "train_speed(iter/s)": 0.040944 }, { "clip_ratio": 0.0, "completion_length": 21.6, "epoch": 0.8161616161616162, "grad_norm": 19.212770462036133, "kl": 0.0832275390625, "learning_rate": 2e-07, "loss": 0.004856839030981064, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500001043081285, "reward_std": 0.2800416827201843, "rewards/MultiModalAccuracyORM": 0.27500001043081285, "step": 2020, "train_speed(iter/s)": 0.040949 }, { "clip_ratio": 0.0, "completion_length": 20.2, "epoch": 0.8181818181818182, "grad_norm": 0.25410985946655273, "kl": 0.129962158203125, "learning_rate": 2e-07, "loss": 0.016422802209854127, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5250000059604645, "reward_std": 0.09041781425476074, "rewards/MultiModalAccuracyORM": 0.5250000059604645, "step": 2025, "train_speed(iter/s)": 0.040961 }, { "clip_ratio": 0.0, "completion_length": 8.25, "epoch": 0.8202020202020202, "grad_norm": 6.931528568267822, "kl": 0.260528564453125, "learning_rate": 2e-07, "loss": -0.02277086079120636, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5583333410322666, "reward_std": 0.2526735752820969, "rewards/MultiModalAccuracyORM": 0.5583333410322666, "step": 2030, "train_speed(iter/s)": 0.040965 }, { "clip_ratio": 0.0, "completion_length": 15.7, "epoch": 0.8222222222222222, "grad_norm": 27.311315536499023, "kl": 0.07995872497558594, "learning_rate": 2e-07, "loss": 0.024982047080993653, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666753590107, "reward_std": 0.3019101768732071, "rewards/MultiModalAccuracyORM": 0.3416666753590107, "step": 2035, "train_speed(iter/s)": 0.040969 }, { "clip_ratio": 0.0, "completion_length": 25.5, "epoch": 0.8242424242424242, "grad_norm": 0.08455629646778107, "kl": 0.1283721923828125, "learning_rate": 2e-07, "loss": 0.007968991994857788, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2083333395421505, "reward_std": 0.2167353242635727, "rewards/MultiModalAccuracyORM": 0.2083333395421505, "step": 2040, "train_speed(iter/s)": 0.040978 }, { "clip_ratio": 0.0, "completion_length": 17.7, "epoch": 0.8262626262626263, "grad_norm": 0.012692108750343323, "kl": 0.06329593658447266, "learning_rate": 2e-07, "loss": 0.019880211353302, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4750000022351742, "reward_std": 0.1559540092945099, "rewards/MultiModalAccuracyORM": 0.4750000022351742, "step": 2045, "train_speed(iter/s)": 0.040984 }, { "clip_ratio": 0.0, "completion_length": 18.25, "epoch": 0.8282828282828283, "grad_norm": 0.49161991477012634, "kl": 0.041827392578125, "learning_rate": 2e-07, "loss": 0.023220118880271912, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333358168602, "reward_std": 0.13882583379745483, "rewards/MultiModalAccuracyORM": 0.2583333358168602, "step": 2050, "train_speed(iter/s)": 0.040982 }, { "clip_ratio": 0.0, "completion_length": 5.25, "epoch": 0.8303030303030303, "grad_norm": 3.920830249786377, "kl": 0.130963134765625, "learning_rate": 2e-07, "loss": 0.012984590232372284, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3666666753590107, "reward_std": 0.3127244710922241, "rewards/MultiModalAccuracyORM": 0.3666666753590107, "step": 2055, "train_speed(iter/s)": 0.040988 }, { "clip_ratio": 0.0, "completion_length": 13.95, "epoch": 0.8323232323232324, "grad_norm": 2.618926763534546, "kl": 0.0820068359375, "learning_rate": 2e-07, "loss": -0.0011547774076461792, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666738688946, "reward_std": 0.174509859085083, "rewards/MultiModalAccuracyORM": 0.3166666738688946, "step": 2060, "train_speed(iter/s)": 0.040997 }, { "clip_ratio": 0.0, "completion_length": 4.05, "epoch": 0.8343434343434344, "grad_norm": 21.554759979248047, "kl": 0.2670654296875, "learning_rate": 2e-07, "loss": 0.008714067935943603, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000059604645, "reward_std": 0.22781596183776856, "rewards/MultiModalAccuracyORM": 0.2750000059604645, "step": 2065, "train_speed(iter/s)": 0.041007 }, { "clip_ratio": 0.0, "completion_length": 24.4, "epoch": 0.8363636363636363, "grad_norm": 0.038795698434114456, "kl": 0.09162445068359375, "learning_rate": 2e-07, "loss": 0.01877760738134384, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2416666679084301, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.2416666679084301, "step": 2070, "train_speed(iter/s)": 0.041016 }, { "clip_ratio": 0.0, "completion_length": 24.65, "epoch": 0.8383838383838383, "grad_norm": 0.5922779440879822, "kl": 0.17679443359375, "learning_rate": 2e-07, "loss": 0.007905527949333191, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5, "reward_std": 0.0, "rewards/MultiModalAccuracyORM": 0.5, "step": 2075, "train_speed(iter/s)": 0.041029 }, { "clip_ratio": 0.0, "completion_length": 7.05, "epoch": 0.8404040404040404, "grad_norm": 0.48757824301719666, "kl": 0.1322998046875, "learning_rate": 2e-07, "loss": 0.006768345832824707, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5333333358168602, "reward_std": 0.1356445163488388, "rewards/MultiModalAccuracyORM": 0.5333333358168602, "step": 2080, "train_speed(iter/s)": 0.04104 }, { "clip_ratio": 0.0, "completion_length": 8.2, "epoch": 0.8424242424242424, "grad_norm": 7.100019931793213, "kl": 0.09602890014648438, "learning_rate": 2e-07, "loss": -0.010533835738897324, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833334103226663, "reward_std": 0.22001479864120482, "rewards/MultiModalAccuracyORM": 0.25833334103226663, "step": 2085, "train_speed(iter/s)": 0.041047 }, { "clip_ratio": 0.0, "completion_length": 9.15, "epoch": 0.8444444444444444, "grad_norm": 10.953103065490723, "kl": 0.205523681640625, "learning_rate": 2e-07, "loss": 0.07547287940979004, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333333805203438, "reward_std": 0.16852193474769592, "rewards/MultiModalAccuracyORM": 0.23333333805203438, "step": 2090, "train_speed(iter/s)": 0.041052 }, { "clip_ratio": 0.0, "completion_length": 6.75, "epoch": 0.8464646464646465, "grad_norm": 4.194830894470215, "kl": 0.0806396484375, "learning_rate": 2e-07, "loss": -0.017879560589790344, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3916666693985462, "reward_std": 0.11702905893325806, "rewards/MultiModalAccuracyORM": 0.3916666693985462, "step": 2095, "train_speed(iter/s)": 0.041059 }, { "clip_ratio": 0.0, "completion_length": 23.75, "epoch": 0.8484848484848485, "grad_norm": 0.12948361039161682, "kl": 0.13734283447265624, "learning_rate": 2e-07, "loss": -0.01447494924068451, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4750000111758709, "reward_std": 0.2338038980960846, "rewards/MultiModalAccuracyORM": 0.4750000111758709, "step": 2100, "train_speed(iter/s)": 0.041064 }, { "clip_ratio": 0.0, "completion_length": 22.05, "epoch": 0.8505050505050505, "grad_norm": 31.3735294342041, "kl": 0.177423095703125, "learning_rate": 2e-07, "loss": -0.0017697295174002648, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25000000521540644, "reward_std": 0.243092343211174, "rewards/MultiModalAccuracyORM": 0.25000000521540644, "step": 2105, "train_speed(iter/s)": 0.041067 }, { "clip_ratio": 0.0, "completion_length": 4.65, "epoch": 0.8525252525252526, "grad_norm": 2.228029251098633, "kl": 0.14156494140625, "learning_rate": 2e-07, "loss": -0.010953420400619506, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5166666679084301, "reward_std": 0.22297748625278474, "rewards/MultiModalAccuracyORM": 0.5166666679084301, "step": 2110, "train_speed(iter/s)": 0.041074 }, { "clip_ratio": 0.0, "completion_length": 47.35, "epoch": 0.8545454545454545, "grad_norm": 0.3235064446926117, "kl": 0.19440174102783203, "learning_rate": 2e-07, "loss": -0.010122859477996826, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2750000074505806, "reward_std": 0.1888910174369812, "rewards/MultiModalAccuracyORM": 0.2750000074505806, "step": 2115, "train_speed(iter/s)": 0.041074 }, { "clip_ratio": 0.0, "completion_length": 34.1, "epoch": 0.8565656565656565, "grad_norm": 9.72260856628418, "kl": 0.18918914794921876, "learning_rate": 2e-07, "loss": 0.0024737130850553514, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45833334028720857, "reward_std": 0.16925235390663146, "rewards/MultiModalAccuracyORM": 0.45833334028720857, "step": 2120, "train_speed(iter/s)": 0.041072 }, { "clip_ratio": 0.0, "completion_length": 42.2, "epoch": 0.8585858585858586, "grad_norm": 9.817282676696777, "kl": 0.299951171875, "learning_rate": 2e-07, "loss": 0.00935778021812439, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.17500000074505806, "reward_std": 0.15824586153030396, "rewards/MultiModalAccuracyORM": 0.17500000074505806, "step": 2125, "train_speed(iter/s)": 0.041077 }, { "clip_ratio": 0.0, "completion_length": 7.6, "epoch": 0.8606060606060606, "grad_norm": 0.3442615568637848, "kl": 0.17735595703125, "learning_rate": 2e-07, "loss": 0.006759631633758545, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.48333334028720853, "reward_std": 0.2730426698923111, "rewards/MultiModalAccuracyORM": 0.48333334028720853, "step": 2130, "train_speed(iter/s)": 0.041087 }, { "clip_ratio": 0.0, "completion_length": 79.15, "epoch": 0.8626262626262626, "grad_norm": 0.520937979221344, "kl": 0.07093505859375, "learning_rate": 2e-07, "loss": -0.012908129394054413, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000001043081284, "reward_std": 0.1996457099914551, "rewards/MultiModalAccuracyORM": 0.20000001043081284, "step": 2135, "train_speed(iter/s)": 0.041077 }, { "clip_ratio": 0.0, "completion_length": 24.9, "epoch": 0.8646464646464647, "grad_norm": 1.4221155643463135, "kl": 0.132373046875, "learning_rate": 2e-07, "loss": -0.07007729411125183, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.21422483026981354, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 2140, "train_speed(iter/s)": 0.041076 }, { "clip_ratio": 0.0, "completion_length": 34.4, "epoch": 0.8666666666666667, "grad_norm": 19.47251319885254, "kl": 0.0463134765625, "learning_rate": 2e-07, "loss": 0.02097744941711426, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3666666723787785, "reward_std": 0.25897533297538755, "rewards/MultiModalAccuracyORM": 0.3666666723787785, "step": 2145, "train_speed(iter/s)": 0.041079 }, { "clip_ratio": 0.0, "completion_length": 35.05, "epoch": 0.8686868686868687, "grad_norm": 0.0365481972694397, "kl": 0.07025909423828125, "learning_rate": 2e-07, "loss": -0.00900230035185814, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5666666716337204, "reward_std": 0.1295969843864441, "rewards/MultiModalAccuracyORM": 0.5666666716337204, "step": 2150, "train_speed(iter/s)": 0.041076 }, { "clip_ratio": 0.0, "completion_length": 15.55, "epoch": 0.8707070707070707, "grad_norm": 3.220684051513672, "kl": 0.11529541015625, "learning_rate": 2e-07, "loss": 0.05271543264389038, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.23333334177732468, "reward_std": 0.3222196638584137, "rewards/MultiModalAccuracyORM": 0.23333334177732468, "step": 2155, "train_speed(iter/s)": 0.041085 }, { "clip_ratio": 0.0, "completion_length": 30.5, "epoch": 0.8727272727272727, "grad_norm": 21.94721031188965, "kl": 0.1160491943359375, "learning_rate": 2e-07, "loss": 0.0024079522117972374, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20000000447034835, "reward_std": 0.24009110629558564, "rewards/MultiModalAccuracyORM": 0.20000000447034835, "step": 2160, "train_speed(iter/s)": 0.04109 }, { "clip_ratio": 0.0, "completion_length": 16.7, "epoch": 0.8747474747474747, "grad_norm": 20.038494110107422, "kl": 0.1658905029296875, "learning_rate": 2e-07, "loss": 0.04994232654571533, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.533333345502615, "reward_std": 0.325963220000267, "rewards/MultiModalAccuracyORM": 0.533333345502615, "step": 2165, "train_speed(iter/s)": 0.041101 }, { "clip_ratio": 0.0, "completion_length": 28.45, "epoch": 0.8767676767676768, "grad_norm": 2.1534128189086914, "kl": 0.0698028564453125, "learning_rate": 2e-07, "loss": -0.025438961386680604, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.21666667014360427, "reward_std": 0.21524804830551147, "rewards/MultiModalAccuracyORM": 0.21666667014360427, "step": 2170, "train_speed(iter/s)": 0.041105 }, { "clip_ratio": 0.0, "completion_length": 7.25, "epoch": 0.8787878787878788, "grad_norm": 6.415175437927246, "kl": 0.0926239013671875, "learning_rate": 2e-07, "loss": -0.007227879762649536, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.44166667610406873, "reward_std": 0.32300969064235685, "rewards/MultiModalAccuracyORM": 0.44166667610406873, "step": 2175, "train_speed(iter/s)": 0.041115 }, { "clip_ratio": 0.0, "completion_length": 7.3, "epoch": 0.8808080808080808, "grad_norm": 0.38973256945610046, "kl": 0.13404541015625, "learning_rate": 2e-07, "loss": 0.023914989829063416, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4000000059604645, "reward_std": 0.1896214485168457, "rewards/MultiModalAccuracyORM": 0.4000000059604645, "step": 2180, "train_speed(iter/s)": 0.041124 }, { "clip_ratio": 0.0, "completion_length": 10.9, "epoch": 0.8828282828282829, "grad_norm": 0.12656661868095398, "kl": 0.15858612060546876, "learning_rate": 2e-07, "loss": 0.008176784217357635, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666679084301, "reward_std": 0.07810411453247071, "rewards/MultiModalAccuracyORM": 0.3416666679084301, "step": 2185, "train_speed(iter/s)": 0.041129 }, { "clip_ratio": 0.0, "completion_length": 61.4, "epoch": 0.8848484848484849, "grad_norm": 2.246829032897949, "kl": 0.05509033203125, "learning_rate": 2e-07, "loss": 0.0310079425573349, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666708886623, "reward_std": 0.22704698145389557, "rewards/MultiModalAccuracyORM": 0.3416666708886623, "step": 2190, "train_speed(iter/s)": 0.041118 }, { "clip_ratio": 0.0, "completion_length": 11.15, "epoch": 0.8868686868686869, "grad_norm": 0.3648838996887207, "kl": 0.1862060546875, "learning_rate": 2e-07, "loss": -0.014291207492351531, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4416666813194752, "reward_std": 0.29006352424621584, "rewards/MultiModalAccuracyORM": 0.4416666813194752, "step": 2195, "train_speed(iter/s)": 0.041124 }, { "clip_ratio": 0.0, "completion_length": 11.0, "epoch": 0.8888888888888888, "grad_norm": 5.710547924041748, "kl": 0.1601806640625, "learning_rate": 2e-07, "loss": 0.0010113120079040527, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666708886623, "reward_std": 0.15824586153030396, "rewards/MultiModalAccuracyORM": 0.3416666708886623, "step": 2200, "train_speed(iter/s)": 0.041137 }, { "clip_ratio": 0.0, "completion_length": 47.85, "epoch": 0.8909090909090909, "grad_norm": 0.11420593410730362, "kl": 0.1681976318359375, "learning_rate": 2e-07, "loss": 0.0913887619972229, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1083333358168602, "reward_std": 0.18262484967708587, "rewards/MultiModalAccuracyORM": 0.1083333358168602, "step": 2205, "train_speed(iter/s)": 0.041132 }, { "clip_ratio": 0.0, "completion_length": 28.25, "epoch": 0.8929292929292929, "grad_norm": 21.853090286254883, "kl": 0.10088920593261719, "learning_rate": 2e-07, "loss": 0.0005557646509259939, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.47500000819563865, "reward_std": 0.16451202929019929, "rewards/MultiModalAccuracyORM": 0.47500000819563865, "step": 2210, "train_speed(iter/s)": 0.041127 }, { "clip_ratio": 0.0, "completion_length": 34.95, "epoch": 0.8949494949494949, "grad_norm": 0.11827383190393448, "kl": 0.1541900634765625, "learning_rate": 2e-07, "loss": 0.0488810658454895, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3000000059604645, "reward_std": 0.22625695466995238, "rewards/MultiModalAccuracyORM": 0.3000000059604645, "step": 2215, "train_speed(iter/s)": 0.041123 }, { "clip_ratio": 0.0, "completion_length": 42.2, "epoch": 0.896969696969697, "grad_norm": 10.474591255187988, "kl": 0.10148773193359376, "learning_rate": 2e-07, "loss": -0.004365795105695724, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25000000521540644, "reward_std": 0.25591449439525604, "rewards/MultiModalAccuracyORM": 0.25000000521540644, "step": 2220, "train_speed(iter/s)": 0.041126 }, { "clip_ratio": 0.0, "completion_length": 40.2, "epoch": 0.898989898989899, "grad_norm": 0.02211969904601574, "kl": 0.0304107666015625, "learning_rate": 2e-07, "loss": 0.0038854777812957764, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.233333333581686, "reward_std": 0.07409421503543853, "rewards/MultiModalAccuracyORM": 0.233333333581686, "step": 2225, "train_speed(iter/s)": 0.041135 }, { "clip_ratio": 0.0, "completion_length": 12.35, "epoch": 0.901010101010101, "grad_norm": 11.09273910522461, "kl": 0.1110137939453125, "learning_rate": 2e-07, "loss": 0.0425330251455307, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.1000000037252903, "reward_std": 0.16852192878723143, "rewards/MultiModalAccuracyORM": 0.1000000037252903, "step": 2230, "train_speed(iter/s)": 0.041144 }, { "clip_ratio": 0.0, "completion_length": 14.8, "epoch": 0.9030303030303031, "grad_norm": 17.634380340576172, "kl": 0.215167236328125, "learning_rate": 2e-07, "loss": 0.03751255869865418, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3833333358168602, "reward_std": 0.10697162747383118, "rewards/MultiModalAccuracyORM": 0.3833333358168602, "step": 2235, "train_speed(iter/s)": 0.041143 }, { "clip_ratio": 0.0, "completion_length": 6.8, "epoch": 0.9050505050505051, "grad_norm": 0.31089159846305847, "kl": 0.1969482421875, "learning_rate": 2e-07, "loss": 0.011410205066204071, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666738688946, "reward_std": 0.18108985424041749, "rewards/MultiModalAccuracyORM": 0.3416666738688946, "step": 2240, "train_speed(iter/s)": 0.041157 }, { "clip_ratio": 0.0, "completion_length": 22.1, "epoch": 0.907070707070707, "grad_norm": 0.033987369388341904, "kl": 0.1924041748046875, "learning_rate": 2e-07, "loss": 0.0015319785103201865, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.22500000298023223, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.22500000298023223, "step": 2245, "train_speed(iter/s)": 0.041165 }, { "epoch": 0.9090909090909091, "grad_norm": 0.06531964987516403, "learning_rate": 2e-07, "loss": -0.01111970990896225, "memory(GiB)": 104.49, "step": 2250, "train_speed(iter/s)": 0.041175 }, { "epoch": 0.9090909090909091, "eval_clip_ratio": 0.0, "eval_completion_length": 33.406667890548704, "eval_kl": 0.133411865234375, "eval_loss": -0.00466223806142807, "eval_response_clip_ratio": 0.0, "eval_reward": 0.441666671782732, "eval_reward_std": 0.1628412437438965, "eval_rewards/MultiModalAccuracyORM": 0.441666671782732, "eval_runtime": 272.4154, "eval_samples_per_second": 0.184, "eval_steps_per_second": 0.018, "step": 2250 }, { "clip_ratio": 0.0, "completion_length": 23.525, "epoch": 0.9111111111111111, "grad_norm": 0.03232080861926079, "kl": 0.22264862060546875, "learning_rate": 2e-07, "loss": 0.028143799304962157, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4166666720062494, "reward_std": 0.12746492475271226, "rewards/MultiModalAccuracyORM": 0.4166666720062494, "step": 2255, "train_speed(iter/s)": 0.040921 }, { "clip_ratio": 0.0, "completion_length": 11.4, "epoch": 0.9131313131313131, "grad_norm": 0.06567571312189102, "kl": 0.08049087524414063, "learning_rate": 2e-07, "loss": 0.031807747483253476, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.45833333507180213, "reward_std": 0.14564234614372254, "rewards/MultiModalAccuracyORM": 0.45833333507180213, "step": 2260, "train_speed(iter/s)": 0.040927 }, { "clip_ratio": 0.0, "completion_length": 21.6, "epoch": 0.9151515151515152, "grad_norm": 0.668204665184021, "kl": 0.1173919677734375, "learning_rate": 2e-07, "loss": -0.03886902332305908, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333333432674407, "reward_std": 0.16830329298973085, "rewards/MultiModalAccuracyORM": 0.28333333432674407, "step": 2265, "train_speed(iter/s)": 0.040936 }, { "clip_ratio": 0.0, "completion_length": 44.0, "epoch": 0.9171717171717172, "grad_norm": 0.16663120687007904, "kl": 0.1005615234375, "learning_rate": 2e-07, "loss": 0.011882781982421875, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3083333380520344, "reward_std": 0.2659719318151474, "rewards/MultiModalAccuracyORM": 0.3083333380520344, "step": 2270, "train_speed(iter/s)": 0.040942 }, { "clip_ratio": 0.0, "completion_length": 17.5, "epoch": 0.9191919191919192, "grad_norm": 18.440631866455078, "kl": 0.121307373046875, "learning_rate": 2e-07, "loss": 0.01434231996536255, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.33333333805203436, "reward_std": 0.17075418531894684, "rewards/MultiModalAccuracyORM": 0.33333333805203436, "step": 2275, "train_speed(iter/s)": 0.040956 }, { "clip_ratio": 0.0, "completion_length": 18.25, "epoch": 0.9212121212121213, "grad_norm": 30.9835147857666, "kl": 0.118048095703125, "learning_rate": 2e-07, "loss": 0.012100108712911607, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3583333373069763, "reward_std": 0.17781037986278533, "rewards/MultiModalAccuracyORM": 0.3583333373069763, "step": 2280, "train_speed(iter/s)": 0.040966 }, { "clip_ratio": 0.0, "completion_length": 13.3, "epoch": 0.9232323232323232, "grad_norm": 6.152209758758545, "kl": 0.3557861328125, "learning_rate": 2e-07, "loss": -0.025510752201080324, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000074505806, "reward_std": 0.1652424544095993, "rewards/MultiModalAccuracyORM": 0.30000000074505806, "step": 2285, "train_speed(iter/s)": 0.040978 }, { "clip_ratio": 0.0, "completion_length": 47.3, "epoch": 0.9252525252525252, "grad_norm": 22.69240951538086, "kl": 0.155462646484375, "learning_rate": 2e-07, "loss": -0.0011336962692439557, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.21422483026981354, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 2290, "train_speed(iter/s)": 0.040975 }, { "clip_ratio": 0.0, "completion_length": 9.15, "epoch": 0.9272727272727272, "grad_norm": 0.06437839567661285, "kl": 0.16920166015625, "learning_rate": 2e-07, "loss": 0.0063018262386322025, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833333730697633, "reward_std": 0.07810411453247071, "rewards/MultiModalAccuracyORM": 0.25833333730697633, "step": 2295, "train_speed(iter/s)": 0.040991 }, { "clip_ratio": 0.0, "completion_length": 56.8, "epoch": 0.9292929292929293, "grad_norm": 0.896676778793335, "kl": 0.12425537109375, "learning_rate": 2e-07, "loss": 0.01196231171488762, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2916666716337204, "reward_std": 0.29564401507377625, "rewards/MultiModalAccuracyORM": 0.2916666716337204, "step": 2300, "train_speed(iter/s)": 0.040994 }, { "clip_ratio": 0.0, "completion_length": 16.8, "epoch": 0.9313131313131313, "grad_norm": 1.9378466606140137, "kl": 0.067706298828125, "learning_rate": 2e-07, "loss": -0.021140041947364806, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333410322666, "reward_std": 0.2504173070192337, "rewards/MultiModalAccuracyORM": 0.4333333410322666, "step": 2305, "train_speed(iter/s)": 0.041006 }, { "clip_ratio": 0.0, "completion_length": 9.95, "epoch": 0.9333333333333333, "grad_norm": 0.4809723496437073, "kl": 0.1289764404296875, "learning_rate": 2e-07, "loss": 0.003021649643778801, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.10833333730697632, "reward_std": 0.2071926474571228, "rewards/MultiModalAccuracyORM": 0.10833333730697632, "step": 2310, "train_speed(iter/s)": 0.041021 }, { "clip_ratio": 0.0, "completion_length": 42.75, "epoch": 0.9353535353535354, "grad_norm": 0.06879542768001556, "kl": 0.09110107421875, "learning_rate": 2e-07, "loss": -0.004359513521194458, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.47500000298023226, "reward_std": 0.2159808874130249, "rewards/MultiModalAccuracyORM": 0.47500000298023226, "step": 2315, "train_speed(iter/s)": 0.041036 }, { "clip_ratio": 0.0, "completion_length": 13.2, "epoch": 0.9373737373737374, "grad_norm": 0.226049542427063, "kl": 0.09764404296875, "learning_rate": 2e-07, "loss": 0.0010025198571383953, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.20833333805203438, "reward_std": 0.12552748322486879, "rewards/MultiModalAccuracyORM": 0.20833333805203438, "step": 2320, "train_speed(iter/s)": 0.04105 }, { "clip_ratio": 0.0, "completion_length": 6.95, "epoch": 0.9393939393939394, "grad_norm": 7.9168314933776855, "kl": 0.0877655029296875, "learning_rate": 2e-07, "loss": 0.06811027526855469, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5333333387970924, "reward_std": 0.21149236261844634, "rewards/MultiModalAccuracyORM": 0.5333333387970924, "step": 2325, "train_speed(iter/s)": 0.041062 }, { "clip_ratio": 0.0, "completion_length": 10.2, "epoch": 0.9414141414141414, "grad_norm": 0.2699204385280609, "kl": 0.180908203125, "learning_rate": 2e-07, "loss": 0.0060350816696882244, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.27500000447034834, "reward_std": 0.09041781425476074, "rewards/MultiModalAccuracyORM": 0.27500000447034834, "step": 2330, "train_speed(iter/s)": 0.041071 }, { "clip_ratio": 0.0, "completion_length": 12.55, "epoch": 0.9434343434343434, "grad_norm": 27.749364852905273, "kl": 0.22237548828125, "learning_rate": 2e-07, "loss": 0.08456591367721558, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.29166667312383654, "reward_std": 0.1888910174369812, "rewards/MultiModalAccuracyORM": 0.29166667312383654, "step": 2335, "train_speed(iter/s)": 0.041075 }, { "clip_ratio": 0.0, "completion_length": 69.25, "epoch": 0.9454545454545454, "grad_norm": 5.552628517150879, "kl": 0.0543975830078125, "learning_rate": 2e-07, "loss": -0.05388938784599304, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.07500000149011612, "reward_std": 0.19962169826030732, "rewards/MultiModalAccuracyORM": 0.07500000149011612, "step": 2340, "train_speed(iter/s)": 0.041078 }, { "clip_ratio": 0.0, "completion_length": 7.7, "epoch": 0.9474747474747475, "grad_norm": 9.49284839630127, "kl": 0.11671142578125, "learning_rate": 2e-07, "loss": -0.00172628965228796, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.19166667014360428, "reward_std": 0.22629254460334777, "rewards/MultiModalAccuracyORM": 0.19166667014360428, "step": 2345, "train_speed(iter/s)": 0.04109 }, { "clip_ratio": 0.0, "completion_length": 56.05, "epoch": 0.9494949494949495, "grad_norm": 3.0689406394958496, "kl": 0.09317855834960938, "learning_rate": 2e-07, "loss": 0.013809925317764283, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.31666667088866235, "reward_std": 0.3167103588581085, "rewards/MultiModalAccuracyORM": 0.31666667088866235, "step": 2350, "train_speed(iter/s)": 0.041098 }, { "clip_ratio": 0.0, "completion_length": 12.8, "epoch": 0.9515151515151515, "grad_norm": 0.1557140052318573, "kl": 0.280633544921875, "learning_rate": 2e-07, "loss": -0.00421304777264595, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666666865348817, "reward_std": 0.07409421503543853, "rewards/MultiModalAccuracyORM": 0.41666666865348817, "step": 2355, "train_speed(iter/s)": 0.041108 }, { "clip_ratio": 0.0, "completion_length": 30.65, "epoch": 0.9535353535353536, "grad_norm": 7.580443382263184, "kl": 0.096343994140625, "learning_rate": 2e-07, "loss": -0.022874367237091065, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.6333333358168602, "reward_std": 0.17861495018005372, "rewards/MultiModalAccuracyORM": 0.6333333358168602, "step": 2360, "train_speed(iter/s)": 0.041118 }, { "clip_ratio": 0.0, "completion_length": 12.8, "epoch": 0.9555555555555556, "grad_norm": 0.11349290609359741, "kl": 0.21148681640625, "learning_rate": 2e-07, "loss": -6.924470653757453e-05, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5166666708886624, "reward_std": 0.1840525358915329, "rewards/MultiModalAccuracyORM": 0.5166666708886624, "step": 2365, "train_speed(iter/s)": 0.041134 }, { "clip_ratio": 0.0, "completion_length": 7.8, "epoch": 0.9575757575757575, "grad_norm": 16.9438419342041, "kl": 0.20333251953125, "learning_rate": 2e-07, "loss": 0.008581924438476562, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.41666666939854624, "reward_std": 0.16225576102733613, "rewards/MultiModalAccuracyORM": 0.41666666939854624, "step": 2370, "train_speed(iter/s)": 0.041147 }, { "clip_ratio": 0.0, "completion_length": 5.7, "epoch": 0.9595959595959596, "grad_norm": 26.406293869018555, "kl": 0.1427520751953125, "learning_rate": 2e-07, "loss": 0.011251689493656158, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500000596046447, "reward_std": 0.23866584599018098, "rewards/MultiModalAccuracyORM": 0.32500000596046447, "step": 2375, "train_speed(iter/s)": 0.041156 }, { "clip_ratio": 0.0, "completion_length": 44.35, "epoch": 0.9616161616161616, "grad_norm": 0.03468816727399826, "kl": 0.09806137084960938, "learning_rate": 2e-07, "loss": 0.008510185778141022, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5083333373069763, "reward_std": 0.1037161648273468, "rewards/MultiModalAccuracyORM": 0.5083333373069763, "step": 2380, "train_speed(iter/s)": 0.041159 }, { "clip_ratio": 0.0, "completion_length": 9.5, "epoch": 0.9636363636363636, "grad_norm": 12.14474105834961, "kl": 0.127783203125, "learning_rate": 2e-07, "loss": 0.031885528564453126, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.25833333656191826, "reward_std": 0.25566026866436004, "rewards/MultiModalAccuracyORM": 0.25833333656191826, "step": 2385, "train_speed(iter/s)": 0.041174 }, { "clip_ratio": 0.0, "completion_length": 19.1, "epoch": 0.9656565656565657, "grad_norm": 0.8151546716690063, "kl": 0.13538818359375, "learning_rate": 2e-07, "loss": 0.012065254151821136, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.233333333581686, "reward_std": 0.07409421503543853, "rewards/MultiModalAccuracyORM": 0.233333333581686, "step": 2390, "train_speed(iter/s)": 0.041185 }, { "clip_ratio": 0.0, "completion_length": 46.45, "epoch": 0.9676767676767677, "grad_norm": 22.97179412841797, "kl": 0.0504150390625, "learning_rate": 2e-07, "loss": 0.00892886370420456, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.32500001043081284, "reward_std": 0.386316055059433, "rewards/MultiModalAccuracyORM": 0.32500001043081284, "step": 2395, "train_speed(iter/s)": 0.041191 }, { "clip_ratio": 0.0, "completion_length": 15.7, "epoch": 0.9696969696969697, "grad_norm": 0.13443566858768463, "kl": 0.0746551513671875, "learning_rate": 2e-07, "loss": -0.008957084268331528, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.05833333507180214, "reward_std": 0.11702905893325806, "rewards/MultiModalAccuracyORM": 0.05833333507180214, "step": 2400, "train_speed(iter/s)": 0.041198 }, { "clip_ratio": 0.0, "completion_length": 5.3, "epoch": 0.9717171717171718, "grad_norm": 13.01309871673584, "kl": 0.1608978271484375, "learning_rate": 2e-07, "loss": -0.005169375985860825, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4333333395421505, "reward_std": 0.2074468731880188, "rewards/MultiModalAccuracyORM": 0.4333333395421505, "step": 2405, "train_speed(iter/s)": 0.041211 }, { "clip_ratio": 0.0, "completion_length": 65.4, "epoch": 0.9737373737373738, "grad_norm": 20.76219367980957, "kl": 0.10498046875, "learning_rate": 2e-07, "loss": -0.026147454977035522, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3166666693985462, "reward_std": 0.23933667540550232, "rewards/MultiModalAccuracyORM": 0.3166666693985462, "step": 2410, "train_speed(iter/s)": 0.041217 }, { "clip_ratio": 0.0, "completion_length": 17.3, "epoch": 0.9757575757575757, "grad_norm": 5.97620964050293, "kl": 0.098968505859375, "learning_rate": 2e-07, "loss": 0.04436638355255127, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5916666783392429, "reward_std": 0.26292563080787656, "rewards/MultiModalAccuracyORM": 0.5916666783392429, "step": 2415, "train_speed(iter/s)": 0.041224 }, { "clip_ratio": 0.0, "completion_length": 7.9, "epoch": 0.9777777777777777, "grad_norm": 0.16142967343330383, "kl": 0.2656707763671875, "learning_rate": 2e-07, "loss": 0.010275793075561524, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4750000022351742, "reward_std": 0.12558708488941192, "rewards/MultiModalAccuracyORM": 0.4750000022351742, "step": 2420, "train_speed(iter/s)": 0.041233 }, { "clip_ratio": 0.0, "completion_length": 29.5, "epoch": 0.9797979797979798, "grad_norm": 5.270585060119629, "kl": 0.1023193359375, "learning_rate": 2e-07, "loss": 0.013689932227134705, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2583333417773247, "reward_std": 0.2817953139543533, "rewards/MultiModalAccuracyORM": 0.2583333417773247, "step": 2425, "train_speed(iter/s)": 0.041241 }, { "clip_ratio": 0.0, "completion_length": 24.5, "epoch": 0.9818181818181818, "grad_norm": 2.2413382530212402, "kl": 0.09530487060546874, "learning_rate": 2e-07, "loss": -0.009250025451183318, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5166666693985462, "reward_std": 0.17150862216949464, "rewards/MultiModalAccuracyORM": 0.5166666693985462, "step": 2430, "train_speed(iter/s)": 0.041258 }, { "clip_ratio": 0.0, "completion_length": 7.1, "epoch": 0.9838383838383838, "grad_norm": 0.14606672525405884, "kl": 0.098297119140625, "learning_rate": 2e-07, "loss": -0.021257255971431733, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.28333334252238274, "reward_std": 0.19713521599769593, "rewards/MultiModalAccuracyORM": 0.28333334252238274, "step": 2435, "train_speed(iter/s)": 0.041266 }, { "clip_ratio": 0.0, "completion_length": 11.3, "epoch": 0.9858585858585859, "grad_norm": 2.6238768100738525, "kl": 0.10804595947265624, "learning_rate": 2e-07, "loss": 0.007257813215255737, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.30000000149011613, "reward_std": 0.1974250316619873, "rewards/MultiModalAccuracyORM": 0.30000000149011613, "step": 2440, "train_speed(iter/s)": 0.041271 }, { "clip_ratio": 0.0, "completion_length": 18.15, "epoch": 0.9878787878787879, "grad_norm": 0.03827716410160065, "kl": 0.101373291015625, "learning_rate": 2e-07, "loss": 0.011828117072582245, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.4916666731238365, "reward_std": 0.181566059589386, "rewards/MultiModalAccuracyORM": 0.4916666731238365, "step": 2445, "train_speed(iter/s)": 0.041278 }, { "clip_ratio": 0.0, "completion_length": 43.4, "epoch": 0.98989898989899, "grad_norm": 6.416419982910156, "kl": 0.2295166015625, "learning_rate": 2e-07, "loss": 0.009897831082344054, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3750000029802322, "reward_std": 0.10072947144508362, "rewards/MultiModalAccuracyORM": 0.3750000029802322, "step": 2450, "train_speed(iter/s)": 0.041283 }, { "clip_ratio": 0.0, "completion_length": 9.6, "epoch": 0.9919191919191919, "grad_norm": 3.0410783290863037, "kl": 0.14271240234375, "learning_rate": 2e-07, "loss": -0.015740707516670227, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5083333380520344, "reward_std": 0.27522478699684144, "rewards/MultiModalAccuracyORM": 0.5083333380520344, "step": 2455, "train_speed(iter/s)": 0.041292 }, { "clip_ratio": 0.0, "completion_length": 6.15, "epoch": 0.9939393939393939, "grad_norm": 0.742748498916626, "kl": 0.2917930603027344, "learning_rate": 2e-07, "loss": 0.06221296787261963, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.2833333402872086, "reward_std": 0.17702035307884217, "rewards/MultiModalAccuracyORM": 0.2833333402872086, "step": 2460, "train_speed(iter/s)": 0.041301 }, { "clip_ratio": 0.0, "completion_length": 39.7, "epoch": 0.9959595959595959, "grad_norm": 0.5455455780029297, "kl": 0.1237335205078125, "learning_rate": 2e-07, "loss": 0.04647340774536133, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.5750000044703484, "reward_std": 0.09041781425476074, "rewards/MultiModalAccuracyORM": 0.5750000044703484, "step": 2465, "train_speed(iter/s)": 0.041305 }, { "clip_ratio": 0.0, "completion_length": 6.6, "epoch": 0.997979797979798, "grad_norm": 3.567203998565674, "kl": 0.128204345703125, "learning_rate": 2e-07, "loss": -0.006601794809103012, "memory(GiB)": 104.49, "response_clip_ratio": 0.0, "reward": 0.3416666753590107, "reward_std": 0.3019101768732071, "rewards/MultiModalAccuracyORM": 0.3416666753590107, "step": 2470, "train_speed(iter/s)": 0.04132 }, { "epoch": 1.0, "grad_norm": 24.7083740234375, "learning_rate": 2e-07, "loss": 0.018315188586711884, "memory(GiB)": 104.49, "step": 2475, "train_speed(iter/s)": 0.041332 }, { "epoch": 1.0, "eval_clip_ratio": 0.0, "eval_completion_length": 28.336667232513427, "eval_kl": 0.152705078125, "eval_loss": 0.011019712314009666, "eval_response_clip_ratio": 0.0, "eval_reward": 0.4650000059604645, "eval_reward_std": 0.1907379400730133, "eval_rewards/MultiModalAccuracyORM": 0.4650000059604645, "eval_runtime": 238.5041, "eval_samples_per_second": 0.21, "eval_steps_per_second": 0.021, "step": 2475 } ], "logging_steps": 5, "max_steps": 2475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }